def obtain_samples(self, deterministic=False, max_samples=np.inf, max_trajs=np.inf, accum_context=True, resample=1):
    """
    Roll out the policy in ``self.env`` and collect the resulting paths.

    Collection stops once either ``max_samples`` transitions or
    ``max_trajs`` trajectories have been gathered; at least one of the two
    limits must be finite. The limits are only checked between rollouts,
    so the final path may push the transition count past ``max_samples``.

    ``deterministic`` wraps ``self.policy`` in ``MakeDeterministic`` before
    rolling out. ``accum_context`` is forwarded to ``rollout`` unchanged.
    ``resample`` is how often, in trajectories, the policy re-draws its
    latent context via ``policy.sample_z()``.

    Returns a tuple ``(paths, n_steps_total)``: the list of collected paths
    and the summed length of their ``'observations'`` entries.
    """
    assert max_samples < np.inf or max_trajs < np.inf, "either max_samples or max_trajs must be finite"

    if deterministic:
        policy = MakeDeterministic(self.policy)
    else:
        policy = self.policy

    collected = []
    step_count = 0
    traj_count = 0
    while step_count < max_samples and traj_count < max_trajs:
        path = rollout(
            self.env,
            policy,
            max_path_length=self.max_path_length,
            accum_context=accum_context,
        )
        # Record the latent context that produced this trajectory.
        latent = policy.z.detach().cpu().numpy()
        path['context'] = latent
        collected.append(path)

        step_count = step_count + len(path['observations'])
        traj_count = traj_count + 1

        # Open question carried over from the original author: should
        # there also be an option to resample z on every transition?
        if traj_count % resample == 0:
            policy.sample_z()

    return collected, step_count
def obtain_samples(self, deterministic=False, max_samples=np.inf, max_trajs=np.inf, accum_context=True, resample=1, testing=False):
    """
    Optionally run training steps, then collect rollouts of the policy.

    While ``self.itr <= self.num_train_itr`` each call first invokes
    ``self._train(policy, accum_context)``: once when ``self.tandem_train``
    is truthy, otherwise ``self.num_train_itr`` times. ``self.itr`` is
    incremented after every ``_train`` call.

    Rollouts are then gathered until either ``max_samples`` transitions or
    ``max_trajs`` trajectories have been collected; at least one of the two
    limits must be finite. The limits are only checked between rollouts,
    so the final path may push the transition count past ``max_samples``.

    ``testing`` selects what is passed to ``rollout`` as its first
    argument: ``self.env`` when true, ``self.model`` otherwise.
    ``deterministic`` wraps ``self.policy`` in ``MakeDeterministic``.
    ``accum_context`` is forwarded to both ``_train`` and ``rollout``.
    ``resample`` is how often, in trajectories, the policy re-draws its
    latent context via ``policy.sample_z()``.

    Returns a tuple ``(paths, n_steps_total)``: the list of collected paths
    and the summed length of their ``'observations'`` entries.
    """
    assert max_samples < np.inf or max_trajs < np.inf, "either max_samples or max_trajs must be finite"

    if deterministic:
        policy = MakeDeterministic(self.policy)
    else:
        policy = self.policy

    # NOTE(review): assumes self.itr is bumped once per _train call in both
    # modes -- confirm against the intended nesting of the increment.
    # NOTE(review): because the guard uses `<=`, a call that starts with
    # self.itr == self.num_train_itr still trains; in non-tandem mode that
    # is another full batch of num_train_itr steps. Confirm against the
    # initial value of self.itr.
    if self.itr <= self.num_train_itr:
        train_rounds = 1 if self.tandem_train else self.num_train_itr
        for _ in range(train_rounds):
            self._train(policy, accum_context)
            self.itr += 1

    collected = []
    step_count = 0
    traj_count = 0
    while step_count < max_samples and traj_count < max_trajs:
        # Pick the rollout source for this trajectory.
        if testing:
            source = self.env
        else:
            source = self.model
        path = rollout(
            source,
            policy,
            max_path_length=self.max_path_length,
            accum_context=accum_context,
        )
        # Record the latent context that produced this trajectory.
        latent = policy.z.detach().cpu().numpy()
        path['context'] = latent
        collected.append(path)

        step_count = step_count + len(path['observations'])
        traj_count = traj_count + 1

        # Open question carried over from the original author: should
        # there also be an option to resample z on every transition?
        if traj_count % resample == 0:
            policy.sample_z()

    return collected, step_count