def get_batch(self, indices=None, torch_device=None):
    # TODO fix this
    assert indices is None
    num_eps = len(self._datadict.done)  # number of episodes
    indices = np.random.choice(num_eps, self._batch_size, replace=False)
    sampled_datadict = self._datadict.leaf_apply(
        lambda list_of_arr: np.stack(
            [self.hor_chunk(list_of_arr[i]) for i in indices]))

    inputs = AttrDict()
    outputs = AttrDict()
    for key in self._env_spec.observation_names:
        inputs[key] = sampled_datadict[key]
    for key in self._env_spec.action_names:
        inputs[key] = sampled_datadict[key]
    for key in self._env_spec.output_observation_names:
        outputs[key] = sampled_datadict[key]
    outputs.done = sampled_datadict.done.astype(bool)

    if torch_device is not None:
        for d in (inputs, outputs):
            d.leaf_modify(lambda x: torch.from_numpy(x).to(torch_device))

    return inputs, outputs  # shape is (batch, horizon, name_dim...)
def eval_act_sequence(self, model, action_seq, observations, goals):
    """
    Finds the predicted trajectory for a batch of action sequences, starting
    from the given initial observations and previous-observation vectors.

    Arguments:
        model: the underlying dynamics model
        observations: AttrDict (N x ...), initial observations
            (state, state history, action history, latent history)
        action_seq: (N x H x ...) AttrDict, action sequences per initial observation
        goals: should be shape (N, H+1, dO) or broadcastable

    Returns:
        AttrDict with {trajectory, costs}
    """
    # TODO implement multiple particles

    # run the model forward from the initial observations
    all_obs, all_mouts = rollout(self._env_spec, model, observations,
                                 action_seq, self._advance_obs_fn)

    # first unsqueeze each step along a time axis, then concatenate
    all_obs = AttrDict.leaf_combine_and_apply(
        all_obs,
        func=lambda vs: torch.cat(vs, dim=1),
        map_func=lambda arr: arr.unsqueeze(1))
    all_mouts = AttrDict.leaf_combine_and_apply(
        all_mouts,
        func=lambda vs: torch.cat(vs, dim=1),
        map_func=lambda arr: arr.unsqueeze(1))

    costs = self._cost_fn(all_obs, goals, action_seq, all_mouts)

    return AttrDict(
        trajectory=all_obs,
        costs=costs  # (N,)
    )
def get_action(self, model, observation, goals, batch=True):
    """
    Args:
        model (Model):
        observation (AttrDict):
        goals (AttrDict):
        batch (bool):

    Returns:
        AttrDict
    """
    if self._bg_policy is not None:
        action_dict = self._bg_policy.get_action(model, observation, goals,
                                                 batch=batch)
    else:
        action_dict = AttrDict()

    if batch:
        act = np.tile(self._latest_action[None],
                      (observation.obs.shape[0], 1))
    else:
        act = self._latest_action

    # save a copy of the background action if it's there
    if "act" in action_dict.keys():
        action_dict.bg_act = action_dict.act
    action_dict.act = to_torch(act)
    return action_dict
def get_batch(self):
    """
    Returns:
        inputs (AttrDict)
        outputs (AttrDict)
    """
    raise NotImplementedError

    inputs = AttrDict()
    outputs = AttrDict()
    return inputs, outputs
def _reset_curr_episode(self):
    self._curr_episode_obs = AttrDict()
    self._curr_episode_actions = AttrDict()
    self._curr_episode_goals = AttrDict()

    for name in self._model.env_spec.observation_names:
        self._curr_episode_obs[name] = []
    for name in self._model.env_spec.action_names:
        self._curr_episode_actions[name] = []
    for name in self._model.env_spec.goal_names:
        self._curr_episode_goals[name] = []

    self._curr_episode_dones = []
def get_batch(self, indices=None, torch_device=None, get_horizon_goals=False,
              get_action_seq=False, min_idx=0):
    # TODO fix this
    num_eps = len(self._datadict.done)  # number of episodes
    if indices is None:
        assert 0 <= min_idx < self._data_len
        batch = min(self._data_len - min_idx, self._batch_size)
        indices = np.random.choice(self._data_len - min_idx, batch,
                                   replace=False)
        indices += min_idx  # base index to consider in dataset

    # get current batch
    sampled_datadict = self._datadict.leaf_apply(lambda arr: arr[indices])

    inputs = AttrDict()
    outputs = AttrDict()
    goals = AttrDict()
    for key in self._env_spec.observation_names:
        inputs[key] = sampled_datadict[key]
    for key in self._env_spec.action_names:
        inputs[key] = sampled_datadict[key]
    for key in self._env_spec.output_observation_names:
        outputs[key] = sampled_datadict[key]
    outputs.done = sampled_datadict.done

    if torch_device is not None:
        for d in (inputs, outputs, goals):
            d.leaf_modify(lambda x: torch.from_numpy(x).to(torch_device))
        if get_action_seq:
            inputs['act_seq'] = torch.from_numpy(
                self._action_sequences[indices]).to(torch_device)
        if get_horizon_goals:
            for key in self._env_spec.goal_names:
                goals[key] = torch.from_numpy(
                    sampled_datadict[key]).to(torch_device)

    if get_horizon_goals:
        return inputs, outputs, goals
    return inputs, outputs  # shape is (batch, horizon, name_dim...)
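# Hedged usage sketch (not part of the original file): one way get_batch above
# might be consumed by a training loop. `model.device`, `model.loss`, and
# `optimizer` are assumed names used for illustration only.
def train_step(dataset, model, optimizer):
    inputs, outputs = dataset.get_batch(torch_device=model.device)
    loss = model.loss(inputs, outputs)  # assumed model API returning a scalar tensor
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()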
def _load_np(self):
    local_dict = AttrDict()
    if self._input_file is None:
        for key in self._all_names:
            local_dict[key] = []
        split_indices = np.array([])
    else:
        logger.debug('Loading ' + self._input_file)
        datadict = np.load(self._input_file, mmap_mode="r", allow_pickle=True)
        self._data_len = len(datadict['done'])
        split_indices = np.where(datadict['done'])[0] + 1  # one after each episode ends
        # remove the last chunk since it will be empty
        if 0 < self._data_len == split_indices[-1]:
            split_indices = np.delete(split_indices, -1)

        for key in self._all_names:
            assert key in datadict, f'{key} not in np file'
            assert len(datadict[key]) == self._data_len
            # split by episode
            local_dict[key] = np.split(datadict[key], split_indices)

        logger.debug('Dataset length: {}'.format(self._data_len))

    return local_dict, split_indices
def forward(self, inputs, obs_lowd=None):
    """
    Given inputs, map them to the appropriate latent distribution.

    :param inputs (AttrDict): holds obs, prev_obs, prev_act, latent, and "act"
    :param obs_lowd: unused
    :return: AttrDict parametrizing the distribution of latents; holds mu, log_sigma, sample
    """
    assert hasattr(inputs, 'latent')
    assert inputs.latent.dtype in [torch.short, torch.int, torch.long], \
        "Latent is type: " + str(inputs.latent.type())
    orig = inputs.latent.view(inputs.latent.shape[0])  # latent should be (batch, 1)

    # map latent classes to mu, log_sigma
    mus = []
    log_sigs = []
    for latent_class in orig:
        # -1 class specifies online inference
        if latent_class.item() == -1:
            mus.append(self.online_mu)
            log_sigs.append(self.online_log_sigma)
        else:
            mus.append(self.__getattr__("mu_%d" % latent_class.item()))
            log_sigs.append(
                self.__getattr__("log_sigma_%d" % latent_class.item()))

    mu = torch.stack(mus)
    log_sigma = torch.stack(log_sigs)

    # torch_distribution = D.normal.Normal(loc=mu, scale=log_sigma.exp())
    # sample = torch_distribution.rsample()
    # sample from the latent diagonal Gaussian (reparameterization trick for gradients)
    sample = mu + torch.randn_like(mu) * log_sigma.exp()

    return AttrDict({'mu': mu, 'log_sigma': log_sigma, 'sample': sample})
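# Hedged sketch (an assumption, not the original __init__): one way the per-class
# parameters referenced in forward() above could be registered. `num_classes` and
# `latent_dim` are hypothetical constructor arguments.
def _init_latent_params(self, num_classes, latent_dim):
    for i in range(num_classes):
        # looked up via self.__getattr__("mu_%d" % i) / ("log_sigma_%d" % i) in forward()
        self.register_parameter("mu_%d" % i,
                                torch.nn.Parameter(torch.zeros(latent_dim)))
        self.register_parameter("log_sigma_%d" % i,
                                torch.nn.Parameter(torch.zeros(latent_dim)))
    # parameters used when the latent class is -1 (online inference)
    self.online_mu = torch.nn.Parameter(torch.zeros(latent_dim))
    self.online_log_sigma = torch.nn.Parameter(torch.zeros(latent_dim))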
def resample_and_flatten(vs):
    old_acseq = vs[0]
    mean, std = vs[1], vs[2]
    sample = torch.randn_like(old_acseq) * std + mean
    d = AttrDict(act=sample)
    self._env_spec.clip(d, ['act'])
    return d.act.view([-1] + list(old_acseq.shape[2:]))
def get_output_stats(self):
    return AttrDict({
        'mu': self._mu_obs.copy(),
        'mu_delta': self._mu_delta_obs.copy(),
        'sigma': self._sigma_obs.copy(),
        'sigma_delta': self._sigma_delta_obs.copy(),
    })
def _env_step(self, env, dataset, obs, goal):
    # TODO implement online training
    action = self._policy.get_action(self._model, obs, goal)
    next_obs, next_goal, done = env.step(action)

    self._curr_episode_obs = AttrDict.leaf_combine_and_apply(
        [self._curr_episode_obs, next_obs], lambda vs: vs[0] + [vs[1]])
    self._curr_episode_actions = AttrDict.leaf_combine_and_apply(
        [self._curr_episode_actions, action], lambda vs: vs[0] + [vs[1]])
    self._curr_episode_goals = AttrDict.leaf_combine_and_apply(
        [self._curr_episode_goals, next_goal], lambda vs: vs[0] + [vs[1]])
    self._curr_episode_dones.append(done)

    if done:
        dataset.add_episode(self._curr_episode_obs, self._curr_episode_goals,
                            self._curr_episode_actions,
                            self._curr_episode_dones)
        self._reset_curr_episode()
        next_obs, next_goal = env.reset()

    return next_obs, next_goal
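# Hedged sketch of an outer data-collection loop around _env_step above;
# `trainer`, `env`, `dataset`, and `num_env_steps` are assumed names, not
# definitions from the original code.
obs, goal = env.reset()
for _ in range(num_env_steps):
    obs, goal = trainer._env_step(env, dataset, obs, goal)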
def get_goal(self):
    if self._use_future_goals and self._copter.horizon > 0:
        goal = self._copter.get_goal().goal_obs[0, 0]
        next_n = self.trajectory.try_next_n(self._copter.horizon) \
            .reshape(self._copter.horizon, self.x_dim)
        future_goals = np.concatenate([goal[None], next_n], axis=0)
        return self._env_spec.map_to_types(
            AttrDict(goal_obs=future_goals[None]))
    else:
        return self._copter.get_goal()
def get_action(self, model, observation, goal, batch=False):
    """Optimizes the cost function provided in setup().

    Arguments:
        model: must be callable(action_sequence, observation, goal) and return
            an AttrDict of results containing costs (torch array), where
            action_sequence is an AttrDict consisting only of keys in
            self._action_names
        observation: AttrDict of initial observations
        goal: AttrDict with goal_obs, where goal_obs must be N x H+1 x ...
        batch: whether observation/goal are already batched

    Returns:
        AttrDict with {act, action_sequence, results {costs, order}}
    """
    # generate a random action-sequence population
    batch_size = goal.goal_obs.shape[0]  # requires goal_obs to be a key
    device = goal.goal_obs.device

    if not batch:
        observation = observation.leaf_apply(
            lambda arr: arr.unsqueeze(0).repeat_interleave(self._pop_size, dim=0))
        goal = goal.leaf_apply(
            lambda arr: arr.unsqueeze(0).repeat_interleave(self._pop_size, dim=0))
    else:
        observation = observation.leaf_apply(
            lambda arr: arr.repeat_interleave(self._pop_size, dim=0))
        goal = goal.leaf_apply(
            lambda arr: arr.repeat_interleave(self._pop_size, dim=0))

    action_sequence = self._env_spec.get_uniform(
        self._action_names,
        batch_size=batch_size * self._pop_size * self._horizon)
    action_sequence.leaf_modify(lambda x: split_dim(
        torch.from_numpy(x).to(device), dim=0,
        new_shape=[batch_size * self._pop_size, self._horizon]))

    # run the model
    results = model(action_sequence, observation, goal)

    # view as (B, Pop, ...)
    action_sequence.leaf_modify(
        lambda x: split_dim(x, 0, [batch_size, self._pop_size]))
    results.leaf_modify(
        lambda x: split_dim(x, 0, [batch_size, self._pop_size]))

    results['order'] = torch.argsort(results.costs, dim=1)  # lowest to highest (best to worst)
    best = results.order[:, :1].unsqueeze(-1).unsqueeze(-1).expand(
        (-1, -1, self._horizon, self._act_dim))
    best_act_seq = action_sequence.leaf_apply(
        lambda x: torch.gather(x, 1, best))
    best_initial_act = best_act_seq.leaf_apply(
        lambda x: x[:, 0, 0])  # where x is (B, Pop, H, ...)

    return AttrDict(act=best_initial_act, action_sequence=action_sequence,
                    results=results)
def _load_mat(self):
    local_dict = AttrDict()
    if self._input_file is None:
        local_dict = self._env_spec.get_zeros(self._all_names, 0)  # np
    else:
        logger.debug('Loading ' + self._input_file)
        samples = scipy.io.loadmat(self._input_file)

        # split into chunks by episode. dict = {key: list of [Ni, key_shape]}
        data_dict = split_data_by_episodes(samples,
                                           horizon=self._planning_horizon,
                                           n_obs=self._obs_history_length,
                                           n_acs=self._acs_history_length)
        self._mu_obs = data_dict['mu_obs']
        self._sigma_obs = data_dict['sigma_obs']
        self._mu_delta_obs = data_dict['mu_delta_obs']
        self._sigma_delta_obs = data_dict['sigma_delta_obs']

        self._action_sequences = np.concatenate(
            data_dict['act_seq'],
            axis=0).astype(self._env_spec.names_to_dtypes['act'])

        split_indices = np.cumsum(data_dict['episode_sizes'])
        # remove the last chunk since it will be empty
        split_indices = np.delete(split_indices, -1)

        if self._split_indices.size > 0:
            self._split_indices = np.concatenate([
                self._split_indices,
                np.array([self._data_len]),
                self._data_len + split_indices
            ], axis=0)
        else:
            self._split_indices = split_indices

        self._num_episodes += len(data_dict['done'])
        self._data_len += np.sum(data_dict['episode_sizes'])
        logger.debug('Dataset length: {}'.format(self._data_len))

        for key in self._all_names:
            assert key in data_dict, f'{key} not in converted mat file'
            assert len(data_dict[key]) > 0
            # turn the per-episode list into one np array with the correct type
            local_dict[key] = np.concatenate(
                data_dict[key],
                axis=0).astype(self._env_spec.names_to_dtypes[key])
            assert local_dict[key].shape[1:] == self._env_spec.names_to_shapes[key], \
                "Bad data shape for {}: {}, requires {}" \
                .format(key, local_dict[key].shape[1:],
                        self._env_spec.names_to_shapes[key])
            # print(key, self._env_spec.names_to_shapes[key], local_dict[key].shape)
            assert local_dict[key].shape[0] == self._data_len, \
                "Bad data length for {}: {}, requires {}".format(
                    key, local_dict[key].shape, self._data_len)

    return local_dict
def eval_policy(dataset, save_file_name):
    b_size = dataset.batch_size
    d_size = len(dataset)

    obs_all = []
    goals_all = []
    output_actions = []

    iters = math.ceil(d_size / b_size)
    for b in range(iters):
        logger.debug("[%d/%d]: Eval policy" % (b, iters))
        idxs = np.arange(start=b * b_size, stop=min((b + 1) * b_size, d_size))

        if args.random_goals:
            inputs, outputs = dataset.get_batch(indices=idxs,
                                                torch_device=model.device,
                                                get_horizon_goals=False)
            # unsqueeze to account for broadcasting to H+1 goals
            goals = env_spec.get_uniform(env_spec.goal_names, b_size,
                                         torch_device=model.device).unsqueeze(1)
        else:
            inputs, outputs, goals = dataset.get_batch(indices=idxs,
                                                       torch_device=model.device,
                                                       get_horizon_goals=True)

        # get obs batch
        obs = AttrDict()
        for name in env_spec.observation_names:
            obs[name] = inputs[name]

        act = policy.get_action(model, obs, goals, batch=True)

        goals_all.append(goals.leaf_apply(lambda v: to_numpy(v)))
        obs_all.append(obs.leaf_apply(lambda v: to_numpy(v)))
        output_actions.append(act.leaf_apply(lambda v: to_numpy(v)))

    # one big dictionary
    combined_obs = AttrDict.leaf_combine_and_apply(
        obs_all, lambda vs: np.concatenate(vs, axis=0))
    combined_goals = AttrDict.leaf_combine_and_apply(
        goals_all, lambda vs: np.concatenate(vs, axis=0))
    combined_output_actions = AttrDict.leaf_combine_and_apply(
        output_actions, lambda vs: np.concatenate(vs, axis=0))

    combined_obs.combine(combined_goals)
    combined_obs.combine(combined_output_actions)

    logger.debug("Saving Action Sequences")
    savemat(save_file_name, combined_obs)
def get_action(self, model, observation, goal, batch=False):
    """Optimizes the cost function provided in setup().

    Arguments:
        model: must be callable(action_sequence, observation, goal) and return
            an AttrDict of results containing costs (torch array), where
            action_sequence is an AttrDict consisting only of keys in
            self._action_names
        observation: AttrDict of initial observations
        goal: AttrDict with goal_obs, where goal_obs must be N x H+1 x ...
        batch: whether observation/goal are already batched

    Returns:
        AttrDict with {act, action_sequence, results {costs, order}}
    """
    # generate a random action-sequence population
    batch_size = goal.goal_obs.shape[0]  # requires goal_obs to be a key
    device = goal.goal_obs.device

    if not batch:
        observation = observation.leaf_apply(
            lambda arr: arr.unsqueeze(0).repeat_interleave(self._pop_size, dim=0))
        goal = goal.leaf_apply(
            lambda arr: arr.unsqueeze(0).repeat_interleave(self._pop_size, dim=0))
    else:
        observation = observation.leaf_apply(
            lambda arr: arr.repeat_interleave(self._pop_size, dim=0))
        goal = goal.leaf_apply(
            lambda arr: arr.repeat_interleave(self._pop_size, dim=0))

    action_sequence = self._env_spec.get_uniform(
        self._action_names,
        batch_size=batch_size * self._pop_size * self._horizon)
    action_sequence.leaf_modify(lambda x: split_dim(
        torch.from_numpy(x).to(device), dim=0,
        new_shape=[batch_size * self._pop_size, self._horizon]))

    def resample_and_flatten(vs):
        old_acseq = vs[0]
        mean, std = vs[1], vs[2]
        sample = torch.randn_like(old_acseq) * std + mean
        d = AttrDict(act=sample)
        self._env_spec.clip(d, ['act'])
        return d.act.view([-1] + list(old_acseq.shape[2:]))

    best_initial_act = None
    results = None
    for i in range(self._max_iters):
        # run the model
        results = model(action_sequence, observation, goal)

        # view as (B, Pop, ...)
        action_sequence.leaf_modify(
            lambda x: split_dim(x, 0, [batch_size, self._pop_size]))
        results.leaf_modify(
            lambda x: split_dim(x, 0, [batch_size, self._pop_size]))

        results.order = torch.argsort(results.costs, dim=1)  # lowest to highest (best to worst)
        best = results.order[:, :self._num_elites]
        best = best.unsqueeze(-1).unsqueeze(-1).expand(
            (-1, -1, self._horizon, self._act_dim))
        best_act_seq = action_sequence.leaf_apply(
            lambda x: torch.gather(x, 1, best))
        best_initial_act = best_act_seq.leaf_apply(
            lambda x: x[:, 0, 0])  # where x is (B, Pop, H, ...)
        means = best_act_seq.leaf_apply(lambda x: x.mean(1, keepdim=True))
        stds = best_act_seq.leaf_apply(lambda x: x.std(1, keepdim=True))

        if i < self._max_iters - 1:
            # resample the population around the elite mean and std
            action_sequence = AttrDict.leaf_combine_and_apply(
                [action_sequence, means, stds], resample_and_flatten)

    # act is (B, act_dim)
    best_initial_act.action_sequence = action_sequence  # (B*Pop, horizon, act_dim)
    best_initial_act.results = results

    return best_initial_act
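# Hedged sketch of wiring the CEM optimizer above to eval_act_sequence defined
# earlier: the wrapper matches the callable(action_sequence, observation, goal)
# contract and returns an AttrDict containing costs. `cem_policy`, `mpc_policy`,
# `dynamics_model`, `observation`, and `goal` are assumed names for illustration.
def planner_model(action_sequence, observation, goal):
    return mpc_policy.eval_act_sequence(dynamics_model, action_sequence,
                                        observation, goal)

act_dict = cem_policy.get_action(planner_model, observation, goal, batch=False)
first_action = act_dict.act  # best first action per batch element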
def get_output_stats(self):
    return AttrDict()
def eval_model(dataset, save_file_name):
    b_size = dataset.batch_size
    d_size = len(dataset)

    pred_trajectories = []
    action_sequences = []
    true_trajectories = []
    costs = []

    iters = math.ceil(d_size / b_size)
    for b in range(iters):
        logger.debug("[%d/%d]: Eval model" % (b, iters))
        idxs = np.arange(start=b * b_size, stop=min((b + 1) * b_size, d_size))
        inputs, outputs, goals = dataset.get_batch(indices=idxs,
                                                   torch_device=model.device,
                                                   get_horizon_goals=True,
                                                   get_action_seq=True)

        # get obs batch
        obs = AttrDict()
        for name in env_spec.observation_names:
            obs[name] = inputs[name]

        act_seq = AttrDict()
        act_seq['act'] = inputs['act_seq']

        model.eval()
        all_obs, all_mouts = rollout(env_spec, model, obs, act_seq,
                                     policy._advance_obs_fn)

        # first unsqueeze each step along a time axis, then concatenate
        all_obs = AttrDict.leaf_combine_and_apply(
            all_obs,
            func=lambda vs: torch.cat(vs, dim=1),
            map_func=lambda arr: arr.unsqueeze(1))
        all_mouts = AttrDict.leaf_combine_and_apply(
            all_mouts,
            func=lambda vs: torch.cat(vs, dim=1),
            map_func=lambda arr: arr.unsqueeze(1))

        cost_dict = AttrDict(
            {'costs': policy._cost_fn(all_obs, goals, act_seq, all_mouts)})

        true_trajectories.append(goals.leaf_apply(lambda v: to_numpy(v)))
        pred_trajectories.append(all_obs.leaf_apply(lambda v: to_numpy(v)))
        action_sequences.append(act_seq.leaf_apply(lambda v: to_numpy(v)))
        costs.append(cost_dict.leaf_apply(lambda v: to_numpy(v)))

    # one big dictionary
    final_dict = AttrDict.leaf_combine_and_apply(
        true_trajectories, lambda vs: np.concatenate(vs, axis=0))
    combined_pred = AttrDict.leaf_combine_and_apply(
        pred_trajectories, lambda vs: np.concatenate(vs, axis=0))
    combined_acts = AttrDict.leaf_combine_and_apply(
        action_sequences, lambda vs: np.concatenate(vs, axis=0))
    combined_costs = AttrDict.leaf_combine_and_apply(
        costs, lambda vs: np.concatenate(vs, axis=0))

    final_dict.combine(combined_pred)
    final_dict.combine(combined_acts)  # no overlapping keys
    final_dict.combine(combined_costs)

    logger.debug("Saving Model Trajectories")
    logger.debug("Keys: " + str(final_dict.keys()))
    savemat(save_file_name, final_dict)
def get_goal(self):
    goal = AttrDict(goal_obs=np.tile(self._curr_goal_pos[None, None],
                                     (1, self.horizon + 1, 1)))
    return self._env_spec.map_to_types(goal)
def get_obs(self):
    obs = AttrDict(obs=self._obs[None].copy(),
                   prev_obs=self._prev_obs[None].copy(),
                   prev_act=self._prev_act[None].copy(),
                   latent=-np.ones((1, 1)))  # -1 specifies online inference
    return self._env_spec.map_to_types(obs)