def load_paths(self):
    paths = []
    for i in range(len(self.data)):
        p = self.data[i]
        H = len(p["observations"]) - 1
        path_builder = PathBuilder()
        for t in range(H):
            ob = p["observations"][t, :]
            action = p["actions"][t, :]
            reward = p["rewards"][t]
            next_ob = p["observations"][t + 1, :]
            terminal = 0
            agent_info = {}  # todo (need to unwrap each key)
            env_info = {}  # todo (need to unwrap each key)
            path_builder.add_all(
                observations=ob,
                actions=action,
                rewards=reward,
                next_observations=next_ob,
                terminals=terminal,
                agent_infos=agent_info,
                env_infos=env_info,
            )
        path = path_builder.get_all_stacked()
        paths.append(path)
    return paths

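# All of the snippets in this file build trajectories through a PathBuilder that
# exposes add_all(**key_to_value), get_all_stacked(), and len(). The class below
# is an illustrative stand-in for that interface, included only to make the usage
# easier to follow; the real implementation (e.g. rlkit's PathBuilder) is richer
# and also handles nested dict entries when stacking.
import numpy as np


class SimplePathBuilder(dict):
    """Accumulate per-timestep values under named keys and stack them on demand."""

    def __init__(self):
        super().__init__()
        self._path_length = 0

    def add_all(self, **key_to_value):
        # append one timestep worth of values, one list per key
        for key, value in key_to_value.items():
            self.setdefault(key, []).append(value)
        self._path_length += 1

    def get_all_stacked(self):
        # stack array-like entries into a single array per key; leave
        # dict-valued entries (agent_infos, env_infos) as plain lists
        return {
            key: values if isinstance(values[0], dict) else np.array(values)
            for key, values in self.items()
        }

    def __len__(self):
        return self._path_length
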
def rollout(
    env,
    policy,
    max_path_length,
    no_terminal=False,
    render=False,
    render_kwargs={},
):
    path_builder = PathBuilder()
    observation = env.reset()
    for _ in range(max_path_length):
        action, agent_info = policy.get_action(observation)
        if render:
            env.render(**render_kwargs)
        next_ob, reward, terminal, env_info = env.step(action)
        if no_terminal:
            terminal = False
        path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=np.array([reward]),
            next_observations=next_ob,
            terminals=np.array([terminal]),
            absorbing=np.array([0., 0.]),
            agent_info=agent_info,
            env_info=env_info,
        )
        observation = next_ob
        if terminal:
            break
    return path_builder

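# Hypothetical usage of rollout() above, with a toy policy and environment so
# the example is self-contained; DummyEnv and RandomPolicy are illustrative
# placeholders (old gym-style 4-tuple step API), not part of the original code.
import numpy as np


class DummyEnv:
    """Toy 1-D environment that terminates after 5 steps."""

    def reset(self):
        self.t = 0
        return np.zeros(1)

    def step(self, action):
        self.t += 1
        return np.zeros(1), 0.0, self.t >= 5, {}


class RandomPolicy:
    def get_action(self, observation):
        return np.random.uniform(-1.0, 1.0, size=1), {}


path_builder = rollout(DummyEnv(), RandomPolicy(), max_path_length=10)
path = path_builder.get_all_stacked()
print(len(path_builder), path["rewards"].shape)  # 5 (5, 1)
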
def _start_new_rollout(self, env_idx=None):
    if env_idx is None:
        self._current_path_builders = [PathBuilder() for _ in range(self._env_num)]
        self._obs = self._env.reset()
    else:
        self._current_path_builders[env_idx] = PathBuilder()
        self._obs[env_idx] = self._env.reset(env_idx)[env_idx]

def train_online(self, start_epoch=0):
    # No need for training mode to be True when generating trajectories:
    # training mode is automatically set to True in _try_to_train and
    # reverted to False before that function exits.
    self.training_mode(False)
    self._current_path_builder = PathBuilder()
    self._n_rollouts_total = 0
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        print('EPOCH STARTED')
        for _ in range(self.num_rollouts_per_epoch):
            task_params, obs_task_params = self.train_task_params_sampler.sample()
            self.generate_exploration_rollout(
                task_params=task_params, obs_task_params=obs_task_params)
            if self._n_rollouts_total % self.num_rollouts_between_updates == 0:
                gt.stamp('sample')
                if not self.do_not_train:
                    self._try_to_train(epoch)
                    gt.stamp('train')
        if not self.do_not_eval:
            self._try_to_eval(epoch)
            gt.stamp('eval')
        self._end_epoch()

def train_batch(self, start_epoch):
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        observation = self._start_new_rollout()
        # This implementation is rather naive. If you want to (e.g.)
        # parallelize data collection, this would be the place to do it.
        for i in range(self.num_env_steps_per_epoch):
            observation, terminal = self._take_step_in_env(observation)
            assert terminal[0] == True
        gt.stamp('sample')
        self._try_to_train()
        gt.stamp('train')
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def rollout_path(env, task_params, obs_task_params, post_cond_policy):
    cur_eval_path_builder = PathBuilder()
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    while (not terminal) and len(cur_eval_path_builder) < MAX_PATH_LENGTH:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # ignore environment termination so the rollout runs for MAX_PATH_LENGTH steps
        terminal = False
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
    return cur_eval_path_builder.get_all_stacked()

def train_online(self, start_epoch=0):
    # No need for training mode to be True when generating trajectories:
    # training mode is automatically set to True in _try_to_train and
    # reverted to False before that function exits.
    self.training_mode(False)
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        # in each epoch we first gather data, then do a certain amount of training
        for _ in range(self.num_rollouts_per_epoch):
            task_params, obs_task_params = self.train_task_params_sampler.sample()
            self.generate_exploration_rollout(
                task_params=task_params, obs_task_params=obs_task_params)
        gt.stamp('sample')
        if not self.do_not_train:
            self._try_to_train()
            gt.stamp('train')
        # and then we evaluate
        if epoch % self.freq_eval == 0:
            if not self.do_not_eval:
                self._try_to_eval(epoch)
                gt.stamp('eval')
        self._end_epoch()

def train_online(self, start_epoch=0):
    if not self.environment_farming:
        observation = self._start_new_rollout()
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        for _ in range(self.num_env_steps_per_epoch):
            if not self.environment_farming:
                observation = self.play_one_step(observation)
            else:
                # acquire a remote environment
                remote_env = self.farmer.force_acq_env()
                self.play_ignore(remote_env)
            # training happens outside of the environment threads
            self._try_to_train()
            gt.stamp('train')
        if epoch % 10 == 0:
            self._try_to_eval(epoch)
            gt.stamp('eval')
        self._end_epoch()

def test_path_length(self):
    path = PathBuilder()
    for _ in range(10):
        path.add_all(
            action=np.array([1, 2, 3]),
            obs=-np.array([1, 2, 3]),
        )
    self.assertEqual(len(path), 10)

def load_path(self, path, replay_buffer, obs_dict=None):
    rewards = []
    path_builder = PathBuilder()
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    H = min(len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    for i in range(H):
        if obs_dict:
            ob = path["observations"][i][self.obs_key]
            next_ob = path["next_observations"][i][self.obs_key]
        else:
            ob = path["observations"][i]
            next_ob = path["next_observations"][i]
        if i == 0:
            current_obs = np.zeros((self.stack_obs + 1, len(ob)))
            current_obs[-2, :] = ob
            current_obs[-1, :] = next_ob
        else:
            current_obs = np.vstack((current_obs[1:, :], next_ob))
            assert (current_obs[-2, :] == ob).all(), "mismatch between obs and next_obs"
        obs1 = current_obs[:self.stack_obs, :].flatten()
        obs2 = current_obs[1:, :].flatten()
        action = path["actions"][i]
        reward = path["rewards"][i]
        terminal = path["terminals"][i]
        if not self.load_terminals:
            terminal = np.zeros(terminal.shape)
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.recompute_reward:
            reward = self.env.compute_reward(
                action,
                next_ob,
            )
        reward = np.array([reward])
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=obs1,
            actions=action,
            rewards=reward,
            next_observations=obs2,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
    print("path sum rewards", sum(rewards), len(rewards))

def _handle_rollout_ending(self):
    """
    Implement anything that needs to happen after every rollout.
    """
    self.replay_buffer.terminate_episode()
    self._n_rollouts_total += 1
    if len(self._current_path_builder) > 0:
        self._exploration_paths.append(self._current_path_builder)
        self._current_path_builder = PathBuilder()

def load_path(self, path, replay_buffer, obs_dict=None):
    rewards = []
    path_builder = PathBuilder()
    H = min(len(path["observations"]), len(path["actions"]))
    if obs_dict:
        traj_obs = self.preprocess(path["observations"])
        next_traj_obs = self.preprocess(path["next_observations"])
    else:
        traj_obs = self.env.encode(path["observations"])
        next_traj_obs = self.env.encode(path["next_observations"])
    for i in range(H):
        ob = traj_obs[i]
        next_ob = next_traj_obs[i]
        action = path["actions"][i]
        # #temp fix#
        # ob['state_desired_goal'] = np.zeros_like(ob['state_desired_goal'])
        # ob['latent_desired_goal'] = np.zeros_like(ob['latent_desired_goal'])
        # next_ob['state_desired_goal'] = np.zeros_like(next_ob['state_desired_goal'])
        # next_ob['latent_desired_goal'] = np.zeros_like(next_ob['latent_desired_goal'])
        # action[3] /= 5
        # #temp fix#
        reward = path["rewards"][i]
        terminal = path["terminals"][i]
        if not self.load_terminals:
            terminal = np.zeros(terminal.shape)
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.reward_fn:
            reward = self.reward_fn(ob, action, next_ob, next_ob)
        reward = np.array([reward]).flatten()
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
    print("rewards", np.min(rewards), np.max(rewards))
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    print("path sum rewards", sum(rewards), len(rewards))

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    observation = self._start_new_rollout()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        for _ in range(self.num_env_steps_per_epoch):
            action, agent_info = self._get_action_and_info(observation)
            if self.render:
                self.training_env.render()
            next_ob, raw_reward, terminal, env_info = (
                self.training_env.step(action))
            self._n_env_steps_total += 1
            reward = raw_reward * self.reward_scale
            terminal = np.array([terminal])
            reward = np.array([reward])
            self.posterior_state = self.neural_process.update_posterior_state(
                self.posterior_state,
                observation[self.extra_obs_dim:],
                action,
                reward,
                next_ob,
            )
            next_ob = np.concatenate(
                [self.get_latent_repr(self.posterior_state), next_ob])
            self._handle_step(
                observation,
                action,
                reward,
                next_ob,
                terminal,
                agent_info=agent_info,
                env_info=env_info,
            )
            if terminal or len(self._current_path_builder) >= self.max_path_length:
                self._handle_rollout_ending()
                observation = self._start_new_rollout()
            else:
                observation = next_ob
        gt.stamp('sample')
        if epoch >= self.epoch_to_start_training:
            self._try_to_train()
            gt.stamp('train')
        if epoch >= self.epoch_to_start_training:
            self._try_to_eval(epoch)
            gt.stamp('eval')
        self._end_epoch()

def load_path(self, path, replay_buffer, obs_dict=None):
    # Filter data
    # if not self.data_filter_fn(path): return
    rewards = []
    path_builder = PathBuilder()
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    H = min(len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    for i in range(H):
        if obs_dict:
            ob = path["observations"][i][self.obs_key]
            next_ob = path["next_observations"][i][self.obs_key]
        else:
            ob = path["observations"][i]
            next_ob = path["next_observations"][i]
        action = path["actions"][i]
        reward = path["rewards"][i]
        terminal = path["terminals"][i]
        if not self.load_terminals:
            terminal = np.zeros(terminal.shape)
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.recompute_reward:
            reward = self.env.compute_reward(
                action,
                next_ob,
            )
        reward = np.array([reward]).flatten()
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)
    print("path sum rewards", sum(rewards), len(rewards))

def _handle_rollout_ending(self, eval_task=False):
    """
    Implement anything that needs to happen after every rollout.
    """
    if eval_task:
        self.eval_enc_replay_buffer.terminate_episode(self.task_idx)
    else:
        self.replay_buffer.terminate_episode(self.task_idx)
        self.enc_replay_buffer.terminate_episode(self.task_idx)
    self._n_rollouts_total += 1
    if len(self._current_path_builder) > 0:
        self._exploration_paths.append(
            self._current_path_builder.get_all_stacked())
        self._current_path_builder = PathBuilder()

def train(self):
    '''
    meta-training loop
    '''
    self.pretrain()
    params = self.get_epoch_snapshot(-1)
    logger.save_itr_params(-1, params)
    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()
    # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)
        if it_ == 0:
            print('collecting initial pool of data for train and eval')
            # temp for evaluating
            for idx in self.train_tasks:
                self.task_idx = idx
                self.env.reset_task(idx)
                self.collect_data(self.num_initial_steps, 1, np.inf)
        # Sample data from train tasks.
        for i in range(self.num_tasks_sample):
            idx = np.random.randint(len(self.train_tasks))
            self.task_idx = idx
            self.env.reset_task(idx)
            self.enc_replay_buffer.task_buffers[idx].clear()
            # collect some trajectories with z ~ prior
            if self.num_steps_prior > 0:
                self.collect_data(self.num_steps_prior, 1, np.inf)
            # collect some trajectories with z ~ posterior
            if self.num_steps_posterior > 0:
                self.collect_data(self.num_steps_posterior, 1, self.update_post_train)
            # even if encoder is trained only on samples from the prior,
            # the policy needs to learn to handle z ~ posterior
            if self.num_extra_rl_steps_posterior > 0:
                self.collect_data(self.num_extra_rl_steps_posterior, 1,
                                  self.update_post_train, add_to_enc_buffer=False)
        # Sample train tasks and compute gradient updates on parameters.
        for train_step in range(self.num_train_steps_per_itr):
            indices = np.random.choice(self.train_tasks, self.meta_batch)
            self._do_training(indices)
            self._n_train_steps_total += 1
        gt.stamp('train')
        self.training_mode(False)
        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')
        self._end_epoch()

def _handle_rollout_ending(self):
    self._n_rollouts_total += 1
    if len(self._current_path_builder) > 0:
        path = self._current_path_builder.get_all_stacked()
        self.replay_buffer.add_path(path)
        self._exploration_paths.append(path)
        self._current_path_builder = PathBuilder()

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        observation = self._start_new_rollout()
        for _ in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
            gt.stamp('sample')
            self._try_to_fit(epoch)
            gt.stamp('env_fit')
            self._try_to_train()
            gt.stamp('train')
        self.logger.record_tabular(self.env_loss_key, self.env_loss)
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)
        self.logger.dump_tabular(with_prefix=False, with_timestamp=False)

def collect_new_steps(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
        random=False,
):
    steps_collector = PathBuilder()
    for _ in range(num_steps):
        self.collect_one_step(
            max_path_length,
            discard_incomplete_paths,
            steps_collector,
            random,
        )
    return [steps_collector.get_all_stacked()]

def train_batch(self, start_epoch):
    self._current_path_builder = PathBuilder()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        observation = self._start_new_rollout()
        # This implementation is rather naive. If you want to (e.g.)
        # parallelize data collection, this would be the place to do it.
        for _ in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
        gt.stamp('sample')
        self._try_to_train()
        gt.stamp('train')
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._try_to_fit(epoch)
        gt.stamp('env_fit')
        self.logger.record_tabular(self.env_loss_key, self.env_loss)
        self._end_epoch(epoch)
        self.logger.dump_tabular(with_prefix=False, with_timestamp=False)

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    observation = self._start_new_rollout()
    self.sample_z = self.sample_z_vec()
    observation = np.concatenate([observation, self.sample_z])
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        for t in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
            gt.stamp('sample')
            self._try_to_train()
            gt.stamp('train')
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def _handle_rollout_ending(self):
    self._n_rollouts_total += 1
    if len(self._current_path_builder) > 0:
        path = self._current_path_builder.get_all_stacked()
        # self.env.update_rewards(path)
        self.replay_buffer.add_path(path)
        self._exploration_paths.append(path)  # unneeded, wastes memory
        self._current_path_builder = PathBuilder()

def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length, task_idx):
    cur_eval_path_builder = PathBuilder()
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # save a rendered frame every 10 steps
        if len(cur_eval_path_builder) % 10 == 0:
            # window size used for old mujoco-py:
            env._wrapped_env._get_viewer('rgb_array').render(200, 200, camera_id=0)
            data = env._wrapped_env._get_viewer('rgb_array').read_pixels(200, 200, depth=False)
            # original image is upside-down, so flip it
            img = data[::-1, :, :]
            imsave(
                'plots/walker_irl_frames/walker_task_%02d_step_%03d.png'
                % (task_idx, len(cur_eval_path_builder)),
                img,
            )
        # ignore environment termination so the rollout runs for max_path_length steps
        terminal = False
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
    return cur_eval_path_builder.get_all_stacked()

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    observation = self._start_new_rollout()
    # observation = self.concat_state_z(state, self.curr_z)
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        for _ in range(self.num_env_steps_per_epoch):
            # TODO: append the latent variable here
            action, agent_info = self._get_action_and_info(observation)
            if self.render:
                self.training_env.render()
            next_state, raw_reward, terminal, env_info = (
                self.training_env.step(action))
            next_ob = self.concat_state_z(next_state, self.curr_z)
            self._n_env_steps_total += 1
            reward = raw_reward * self.reward_scale
            terminal = np.array([terminal])
            reward = np.array([reward])
            self._handle_step(
                observation,
                action,
                reward,
                next_ob,
                terminal,
                agent_info=agent_info,
                env_info=env_info,
            )
            if terminal or len(self._current_path_builder) >= self.max_path_length:
                self._handle_rollout_ending()
                observation = self._start_new_rollout()
            else:
                observation = next_ob
        gt.stamp('sample')
        self._try_to_train()
        gt.stamp('train')

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    observation = self._start_new_rollout()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        for _ in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
            gt.stamp('sample')
            self._try_to_train()
            gt.stamp('train')
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def rollout_path(env, task_params, obs_task_params, post_cond_policy, max_path_length):
    cur_eval_path_builder = PathBuilder()
    within_correct = False
    within_incorrect = False
    # reset the env using the params
    observation = env.reset(task_params=task_params, obs_task_params=obs_task_params)
    terminal = False
    task_identifier = env.task_identifier
    while (not terminal) and len(cur_eval_path_builder) < max_path_length:
        agent_obs = observation['obs']
        action, agent_info = post_cond_policy.get_action(agent_obs)
        next_ob, raw_reward, terminal, env_info = env.step(action)
        # ignore environment termination so the rollout runs for max_path_length steps
        terminal = False
        reward = raw_reward
        terminal = np.array([terminal])
        reward = np.array([reward])
        cur_eval_path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
            task_identifiers=task_identifier,
        )
        observation = next_ob
        if env_info['within_radius_of_correct']:
            within_correct = True
        if env_info['within_radius_of_incorrect']:
            within_incorrect = True
    return within_correct, within_incorrect

def train_online(self, start_epoch=0):
    self._current_path_builder = PathBuilder()
    observation = self._start_new_rollout()
    for epoch in gt.timed_for(
            range(start_epoch, self.num_epochs),
            save_itrs=True,
    ):
        self._start_epoch(epoch)
        set_to_train_mode(self.training_env)
        self.training_mode(True)
        processes = []
        gt.stamp('sample')
        if self._can_train():
            # spawn one training process per head; networks are placed in shared memory
            ctx = mp.get_context("spawn")
            for net in self.networks:
                # net.cuda()
                net.share_memory()
            for rank in range(0, self.heads):
                p = ctx.Process(target=self.train_head, args=(rank,))
                p.start()
                processes.append(p)
        for step in range(self.num_env_steps_per_epoch):
            observation = self._take_step_in_env(observation)
        for p in processes:
            p.join()
        gt.stamp('train')
        self.training_mode(False)
        self._n_train_steps_total += self.num_env_steps_per_epoch
        self.current_behavior_policy = np.random.randint(self.heads)
        set_to_eval_mode(self.env)
        self._try_to_eval(epoch)
        gt.stamp('eval')
        self._end_epoch(epoch)

def _handle_rollout_ending(self, env=None):
    """
    Implement anything that needs to happen after every rollout.
    """
    # WARNING: terminate_episode does nothing, so it has not been adapted to farming
    self.replay_buffer.terminate_episode()
    self._n_rollouts_total += 1
    if not self.environment_farming:
        if len(self._current_path_builder) > 0:
            self._exploration_paths.append(
                self._current_path_builder.get_all_stacked())
            self._current_path_builder = PathBuilder()
    elif env:
        _current_path_builder = env.get_current_path_builder()
        if _current_path_builder is None:
            raise ValueError(
                '_handle_rollout_ending: env object should have a current_path_builder field!')
        self._exploration_paths.append(
            _current_path_builder.get_all_stacked())
        env.newPathBuilder()
    else:
        raise ValueError(
            '_handle_rollout_ending: an env object must be given to this function in farming mode!')

def train(self):
    '''
    meta-training loop
    '''
    self.pretrain()
    gt.reset()
    gt.set_def_unique(False)
    self._current_path_builder = PathBuilder()
    # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
    for it_ in gt.timed_for(
            range(self.num_iterations),
            save_itrs=True,
    ):
        self._start_epoch(it_)
        self.training_mode(True)
        # Sample train tasks and compute gradient updates on parameters.
        batch_idxes = np.random.randint(0, len(self.train_goals), size=self.meta_batch_size)
        train_batch_obj_id = self.replay_buffers.sample_training_data(
            batch_idxes, self.use_same_context)
        for _ in range(self.num_train_steps_per_itr):
            train_raw_batch = ray.get(train_batch_obj_id)
            gt.stamp('sample_training_data', unique=False)
            batch_idxes = np.random.randint(0, len(self.train_goals), size=self.meta_batch_size)
            # This way we can launch the data-sampling job for the next
            # training step while training on the current batch.
            train_batch_obj_id = self.replay_buffers.sample_training_data(
                batch_idxes, self.use_same_context)
            gt.stamp('set_up_sampling', unique=False)
            train_data = self.construct_training_batch(train_raw_batch)
            gt.stamp('construct_training_batch', unique=False)
            self._do_training(train_data)
            self._n_train_steps_total += 1
        gt.stamp('train')
        self.training_mode(False)
        # eval
        self._try_to_eval(it_)
        gt.stamp('eval')
        self._end_epoch()
        # it_ only reaches num_iterations - 1, so save the final snapshot on the last iteration
        if it_ == self.num_iterations - 1:
            logger.save_itr_params(it_, self.agent.get_snapshot())

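# A minimal, self-contained sketch of the prefetching pattern used in train()
# above: the next sampling job is launched with ray before blocking on the
# current one, so data collection overlaps with training. `sample_batch` is a
# hypothetical stand-in for replay_buffers.sample_training_data, not part of
# the original code.
import ray


@ray.remote
def sample_batch(batch_idx):
    # placeholder for a remote replay-buffer sampling call
    return {"batch_idx": batch_idx}


ray.init(ignore_reinit_error=True)
batch_obj_id = sample_batch.remote(0)
for step in range(1, 4):
    batch = ray.get(batch_obj_id)             # block on the batch for this step
    batch_obj_id = sample_batch.remote(step)  # immediately queue the next one
    # ... run a training step on `batch` here ...
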
def load_path(self, path, replay_buffer):
    rewards = []
    path_builder = PathBuilder()
    print("loading path, length", len(path["observations"]), len(path["actions"]))
    H = min(len(path["observations"]), len(path["actions"]))
    print("actions", np.min(path["actions"]), np.max(path["actions"]))
    for i in range(H):
        ob = path["observations"][i]
        action = path["actions"][i]
        reward = path["rewards"][i]
        next_ob = path["next_observations"][i]
        terminal = path["terminals"][i]
        agent_info = path["agent_infos"][i]
        env_info = path["env_infos"][i]
        if self.recompute_reward:
            reward = self.env.compute_reward(
                action,
                next_ob,
            )
        reward = np.array([reward])
        rewards.append(reward)
        terminal = np.array([terminal]).reshape((1,))
        path_builder.add_all(
            observations=ob,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )
    self.demo_trajectory_rewards.append(rewards)
    path = path_builder.get_all_stacked()
    replay_buffer.add_path(path)