def sample_data(self, indices, encoder=False):
    ''' sample data from replay buffers to construct a training meta-batch

    :param indices: iterable of task indices to draw one batch from each
    :param encoder: if True, sample from the encoder replay buffer
                    (sequences when ``self.recurrent``) instead of the
                    actor-critic replay buffer
    :return: list ``[obs, actions, rewards, next_obs, terms]``; each entry
             stacks the per-task tensors along a leading task dimension
    '''
    # Delegate per-batch field extraction to unpack_batch for consistency
    # with sample_context / sample_sac instead of duplicating it by hand.
    if encoder:
        batches = [
            ptu.np_to_pytorch_batch(
                self.enc_replay_buffer.random_batch(
                    idx,
                    batch_size=self.embedding_batch_size,
                    sequence=self.recurrent))
            for idx in indices
        ]
        # in sparse reward settings, only the encoder is trained with sparse reward
        unpacked = [
            self.unpack_batch(batch, sparse_reward=self.sparse_rewards)
            for batch in batches
        ]
    else:
        batches = [
            ptu.np_to_pytorch_batch(
                self.replay_buffer.random_batch(idx, batch_size=self.batch_size))
            for idx in indices
        ]
        unpacked = [self.unpack_batch(batch) for batch in batches]
    # group like elements together across tasks, then stack along the task dim
    unpacked = [[x[i] for x in unpacked] for i in range(len(unpacked[0]))]
    return [torch.cat(x, dim=0) for x in unpacked]
def sample_context(self, indices):
    ''' sample batch of context from a list of tasks from the replay buffer '''
    # accept a bare task index as well as an iterable of indices
    if not hasattr(indices, '__iter__'):
        indices = [indices]
    # one unpacked [obs, act, rewards, next_obs, terms] list per task
    per_task = []
    for idx in indices:
        raw = self.enc_replay_buffer.random_batch(
            idx,
            batch_size=self.embedding_batch_size,
            sequence=self.recurrent)
        batch = ptu.np_to_pytorch_batch(raw)
        per_task.append(
            self.unpack_batch(batch, sparse_reward=self.sparse_rewards))
    # transpose task-major nesting into element-major, then stack tasks
    # along the leading dimension
    elements = [torch.cat(group, dim=0) for group in zip(*per_task)]
    # full context consists of [obs, act, rewards, next_obs, terms];
    # terminals are never part of the context, and next_obs is included
    # only when the flag says dynamics vary across tasks
    keep = -1 if self.use_next_obs_in_context else -2
    return torch.cat(elements[:keep], dim=2)
def prepare_context(self, idx):
    ''' sample context from replay buffer and prepare it '''
    raw = self.enc_replay_buffer.random_batch(
        idx,
        batch_size=self.embedding_batch_size,
        sequence=self.recurrent)
    batch = ptu.np_to_pytorch_batch(raw)
    # add a leading singleton (task) dimension to each field
    obs = batch['observations'][None, ...]
    act = batch['actions'][None, ...]
    rewards = batch['rewards'][None, ...]
    return self.prepare_encoder_data(obs, act, rewards)
def get_samples(self):
    '''Draw one random transition batch and return its fields separately,
    each with a leading singleton dimension prepended.'''
    batch = ptu.np_to_pytorch_batch(
        self.replay_buffer.random_batch(self.batch_size))
    keys = ('observations', 'actions', 'rewards',
            'next_observations', 'terminals')
    o, a, r, no, t = (batch[key][None, ...] for key in keys)
    return o, a, r, no, t
def sample_sac(self, indices):
    ''' sample batch of training data from a list of tasks for training the actor-critic '''
    # transitions are drawn uniformly at random from the replay buffer;
    # rewards are always dense here
    per_task = []
    for idx in indices:
        batch = ptu.np_to_pytorch_batch(
            self.replay_buffer.random_batch(idx, batch_size=self.batch_size))
        per_task.append(self.unpack_batch(batch))
    # regroup so each element collects one tensor per task, then stack
    # the tasks along the leading dimension
    return [torch.cat(group, dim=0) for group in zip(*per_task)]
def sample_low_level(self):
    '''Sample a batch of low level interactions
    In the form of ([state,goal], primitive action, parameterized reward, next state)'''
    batch = ptu.np_to_pytorch_batch(
        self.low_buffer.random_batch(batch_size=self.low_batch_size))
    # unpack_batch puts the batch into the format [o, a, r, no, t];
    # indexing with [0] drops the leading singleton dimension of each field
    elements = self.unpack_batch(batch)
    return [torch.cat([elem[0]], dim=0) for elem in elements]
def sample_high_level(self, indices):
    '''Sample a batch of high level interactions
    In the form of (Original state, Goal given, Reward received, State achieved)'''
    per_task = [
        self.unpack_batch(
            ptu.np_to_pytorch_batch(
                self.high_buffer.random_batch(
                    idx, batch_size=self.high_batch_size)))
        for idx in indices
    ]
    # transpose task-major nesting into element-major, then stack the
    # tasks along the leading dimension
    return [torch.cat(group, dim=0) for group in zip(*per_task)]