def __init__(self, encoder, decoder, latent_dim, step_dim, obs_dim, act_dim,
             policy, env, optimizer=None, loss_type='mse', init_kl_weight=.001,
             max_kl_weight=.1, kl_mul=1.07, vae_loss_weight=1, lr=1e-3,
             bc_weight=100, ego=False, egoidx=None):
    self.encoder = encoder
    self.obs_dim = obs_dim
    self.act_dim = act_dim
    self.decoder = decoder
    self.ego = ego
    self.egoidx = egoidx
    self.bc_weight = bc_weight
    self.env = env()
    self.policy = policy
    # Unit Gaussian prior over the latent code.
    self.unit_n = Normal(Variable(torch.zeros(1, latent_dim)),
                         log_var=Variable(torch.zeros(1, latent_dim)))
    self.latent_dim = latent_dim
    self.step_dim = step_dim
    self.init_kl_weight = init_kl_weight
    self.max_kl_weight = max_kl_weight
    self.kl_mul = kl_mul
    if optimizer is None:
        optimizer = Adam(self.get_params(), lr=lr, eps=1e-5)
    self.loss_type = loss_type
    self.vae_loss_weight = vae_loss_weight
    self.optimizer = optimizer
    if gpu_enabled():
        self.encoder.cuda()
        self.decoder.cuda()
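
# Hedged sketch, not part of the original class: init_kl_weight, max_kl_weight
# and kl_mul look like the parameters of a multiplicative KL-annealing schedule
# for the VAE objective. The helper below is an assumption about how such a
# schedule is typically applied; the real update rule lives elsewhere.
def annealed_kl_weight(itr, init_kl_weight=1e-3, max_kl_weight=0.1, kl_mul=1.07):
    # Grow the KL weight geometrically per iteration, capped at max_kl_weight.
    return min(init_kl_weight * (kl_mul ** itr), max_kl_weight)

# With the defaults above, the weight starts at 0.001 and saturates at 0.1
# after roughly log(100) / log(1.07) ~ 68 iterations.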
def compute_loss(self, x, actdata, z, z_dist):
    # Decode the latent into a distribution over the future states.
    y_dist = self.decode(x, z)
    y = x[:, self.step_dim:].contiguous()
    xview = x.view((x.size()[0], -1, self.obs_dim)).clone()
    if self.ego:
        # Make observations egocentric with respect to the first step.
        xview[:, :, self.egoidx] -= xview[:, [0], self.egoidx].unsqueeze(1)
    # Condition the policy on (observation, latent) pairs.
    zexpand = z.unsqueeze(1).expand(*xview.size()[:2], z.size()[-1])
    xz = torch.cat((Variable(xview), zexpand), -1)
    xz_view = xz.view((-1, xz.size()[-1]))
    if self.policy.recurrent():
        self.policy.reset(x.size()[0])
        xz_view = xz.transpose(0, 1)
        actdata = actdata.view((actdata.size()[0], -1, self.act_dim))
        actdata = actdata.transpose(0, 1).contiguous()
    dist = self.policy.forward(xz_view)
    act_view = actdata.view((-1, self.act_dim))
    # Behavior-cloning loss: negative log-likelihood of the expert actions.
    bcloss = -dist.log_likelihood(Variable(act_view))
    if self.policy.recurrent():
        bcloss = bcloss.view(*actdata.size()[:2]).mean(0)
    else:
        bcloss = bcloss.view((actdata.size()[0], -1)).mean(1)
    mse, neg_ll, kl = self.loss(Variable(y), y_dist, z_dist)
    return mse, neg_ll, kl, bcloss, z_dist
def test(self, dataset):
    data = FloatTensor(dataset.train_data)
    x, actdata = self.splitobs(data)
    y = x[:, self.step_dim:]
    z_dist = self.encode(Variable(x))
    z = z_dist.sample()
    y_dist = self.decode(x, z)
    # Mean squared error of the decoder MLE against the target states.
    mse = torch.pow(y_dist.mle - Variable(y), 2).mean(-1).mean(0)
    return get_numpy(mse).item()
def decode(self, x, z):
    if self.decoder.recurrent():
        initial_input = x[:, :self.step_dim].contiguous().clone()
        if self.ego:
            # Zero out the ego coordinates but remember the offset.
            diff = initial_input[:, self.egoidx].clone()
            initial_input[:, self.egoidx] = 0
        output = self.decoder.forward(z, initial_input=Variable(initial_input))
        if self.ego:
            # Shift the decoded ego coordinates back by the stored offset.
            bs = output.mean.shape[0]
            mean = output.mean.view((bs, -1, self.step_dim))
            mean[:, :, self.egoidx] += Variable(diff[:, None])
            output.mean = mean.view(output.mean.shape)
        return output
    else:
        return self.decoder.forward(z)
def rollout(self, max_path_length, add_input=None, reset_args=None, volatile=False):
    sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[], states=[])
    obs = self.reset_envs(reset_args)
    self.policy.reset(obs.shape[0])
    for s in range(max_path_length):
        state = self.policy.get_state().data if self.policy.recurrent() else None
        if self.ego:
            # Egocentric observations: subtract the reset position.
            obs_ego = obs.copy()
            obs_ego[:, self.egoidx] -= reset_args[:, self.egoidx]
            policy_input = Variable(from_numpy(obs_ego).float(), volatile=volatile)
        else:
            policy_input = Variable(from_numpy(obs).float(), volatile=volatile)
        if add_input is not None:
            policy_input = torch.cat([policy_input, add_input], -1)
        action_dist = self.policy.forward(policy_input)
        action = action_dist.sample()
        if self.random_action_p > 0:
            # Epsilon-style exploration: replace some actions with random ones.
            flip = np.random.binomial(1, self.random_action_p, size=len(obs))
            if flip.sum() > 0:
                random_act = np.random.randint(0, self.policy.output_dim, size=flip.sum())
                action[from_numpy(flip).byte()] = from_numpy(random_act)
        next_obs, rewards, done, info = self.step_envs(get_numpy(action))
        sd['obs'].append(obs)
        sd['rewards'].append(rewards)
        sd['actions'].append(action)
        sd['action_dist_lst'].append(action_dist)
        sd['states'].append(state)
        obs = next_obs
    # Append last obs
    sd['obs'].append(obs)
    sd['obs'] = np.stack(sd['obs'], 1)  # (bs, max_path_length + 1, obs_dim)
    sd['states'] = torch.stack(sd['states'], 2) if self.policy.recurrent() else None
    sd['rewards'] = np.stack(sd['rewards'], 1)  # (bs, max_path_length)
    sd['actions'] = torch.stack(sd['actions'], 1)
    sd['action_dist'] = sd['action_dist_lst'][0].combine(sd['action_dist_lst'], torch.stack, axis=1)
    return sd
def rollout(env, policy, max_path_length, add_input=None, volatile=False, reset_args=None):
    sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[])
    obs = env.reset(reset_args)
    for s in range(max_path_length):
        policy_input = Variable(from_numpy(np.array([obs])).float(), volatile=volatile)
        if add_input is not None:
            policy_input = torch.cat([policy_input, add_input], -1)
        if s == 0:
            policy.reset(1)
        if policy.recurrent():
            policy_input = policy_input.unsqueeze(0)
        action_dist = policy.forward(policy_input)
        action = action_dist.sample()
        next_obs, reward = env.step(get_numpy(action))[:2]
        sd['obs'].append(obs)
        sd['rewards'].append(reward)
        sd['actions'].append(action)
        obs = next_obs
    # Append last obs
    sd['obs'].append(obs)
    sd['obs'] = np.array(sd['obs'])          # (max_path_length + 1, obs_dim)
    sd['rewards'] = np.array(sd['rewards'])  # (max_path_length,)
    sd['actions'] = torch.stack(sd['actions'], 1)
    return sd
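
# Hedged usage sketch for the standalone rollout() above. DummyEnv and
# DummyPolicy are hypothetical stand-ins (not part of this codebase) for a
# gym-style env with reset(reset_args)/step(action) and a policy exposing
# reset(bs), recurrent() and forward(obs) -> action distribution.
#
#   env = DummyEnv()
#   policy = DummyPolicy()
#   sd = rollout(env, policy, max_path_length=100)
#   sd['obs'].shape      # (max_path_length + 1, obs_dim), includes the final obs
#   sd['rewards'].shape  # (max_path_length,)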
def forward_batch(self, batch):
    obsdata, actdata = self.splitobs(batch)
    x = obsdata
    z_dist = self.encode(Variable(x))
    z = z_dist.sample()
    return self.compute_loss(x, actdata, z, z_dist)
def rollout(self, max_path_length, add_input=None, volatile=False):
    sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[])
    obs = self.envs.reset()
    self.policy.reset(len(obs))
    for s in range(max_path_length):
        policy_input = Variable(from_numpy(np.stack(obs)).float(), volatile=volatile)
        if add_input is not None:
            policy_input = torch.cat([policy_input, add_input], -1)
        action_dist = self.policy.forward(policy_input)
        action = action_dist.sample()
        if self.random_action_p > 0:
            # Epsilon-style exploration: replace some actions with random ones.
            flip = np.random.binomial(1, self.random_action_p, size=len(obs))
            if flip.sum() > 0:
                random_act = np.random.randint(0, int(self.env.action_space.flat_dim), size=flip.sum())
                action[from_numpy(flip).byte()] = from_numpy(random_act)
        next_obs, rewards, done, info = self.envs.step(get_numpy(action))
        sd['obs'].append(obs)
        sd['rewards'].append(rewards)
        sd['actions'].append(action)
        sd['action_dist_lst'].append(action_dist)
        obs = next_obs
    # Append last obs
    sd['obs'].append(obs)
    sd['obs'] = np.stack(sd['obs'], 1)          # (bs, max_path_length + 1, obs_dim)
    sd['rewards'] = np.stack(sd['rewards'], 1)  # (bs, max_path_length)
    sd['actions'] = torch.stack(sd['actions'], 1)
    sd['action_dist'] = sd['action_dist_lst'][0].combine(sd['action_dist_lst'], torch.stack, axis=1)
    return sd
def predict(self, obs_np):
    bs, path_len, obs_dim = obs_np.shape
    obs = obs_np.reshape(-1, obs_dim)
    if self._coeffs is None:
        return Variable(torch.zeros((bs, path_len)))
    returns = self._features(obs).dot(self._coeffs).reshape((-1, path_len))
    return np_to_var(returns)
def train_pd_match_sd(self, dataset, bs, itr, outer_itr):
    sampler = self.sampler
    # Sample expert trajectories from the dataset to initialize the rollouts from.
    expert_traj, _ = dataset.sample(bs)
    x, actdata = self.vae.splitobs(FloatTensor(expert_traj))
    z = Variable(torch.randn((bs, self.latent_dim)))
    pd_traj, sd_traj = self.forward(sampler, x, z)
    sd_traj_obs = get_numpy(sd_traj.mle)
    traj_3d_shape = (bs, -1, self.obs_dim)
    pd_traj_obs = np_to_var(pd_traj['obs'][:, 1:])
    # Per-step reward: log-likelihood of the rolled-out states under the state decoder.
    se = sd_traj.reshape(traj_3d_shape).log_likelihood(pd_traj_obs)
    mse_sd_pd = self.compute_traj_mse(pd_traj_obs, sd_traj.mle, traj_3d_shape)
    pd_traj['rewards'] = get_numpy(se)
    self.policy_algo.process_samples(0, pd_traj, augment_obs=get_numpy(z))
    self.policy_algo.optimize_policy(0, pd_traj)
    traj_sets = [sd_traj_obs, pd_traj['obs'][:, 1:]]
    pd_traj['stats']['mse_sd_pd'] = get_numpy(mse_sd_pd.mean()).item()
    pd_traj['stats']['ll'] = np.mean(get_numpy(se))
    return pd_traj['stats']
def fit(self, obs_np, returns_np):
    # Re-initialize the value network and regress it onto the empirical returns.
    self.network.apply(xavier_init)
    bs, path_len, obs_dim = obs_np.shape
    obs = from_numpy(obs_np.reshape(-1, obs_dim).astype(np.float32))
    returns = from_numpy(returns_np.reshape(-1).astype(np.float32))
    dataloader = DataLoader(TensorDataset(obs, returns),
                            batch_size=self.batch_size, shuffle=True)
    for epoch in range(self.max_epochs):
        for x, y in dataloader:
            self.optimizer.zero_grad()
            x = Variable(x)
            y = Variable(y).float().view(-1, 1)
            loss = (self.network(x) - y).pow(2).mean()
            loss.backward()
            self.optimizer.step()
        print('loss %f' % get_numpy(loss).item())
def log_likelihood(self, x):
    # x is (bs, path_len * sum(cat_sizes)) one hots
    bs = x.size()[0]
    x_3d = x.view(bs, self.path_len, sum(self.cat_sizes))
    count = 0
    total_ll = Variable(torch.zeros(1))
    for cat_size in self.cat_sizes:
        # Pick out the probability assigned to the observed class of this categorical.
        prob = self.probs_3d[:, :, count:count + cat_size]
        onehot = x_3d[:, :, count:count + cat_size]
        ll = torch.log(torch.sum(prob * onehot, -1) + EPS)
        total_ll += ll.sum()
        count += cat_size
    return total_ll / bs
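
# Hedged standalone sketch (NumPy, not from the original class) of what the
# loop in log_likelihood() computes for one timestep with cat_sizes = [3, 2]:
# the probability of the observed class is picked out per categorical block via
# the one-hot mask, and the per-block log-probabilities are summed.
import numpy as np

EPS = 1e-8
probs = np.array([0.2, 0.5, 0.3, 0.9, 0.1])   # two categorical blocks, sizes 3 and 2
onehot = np.array([0., 1., 0., 1., 0.])       # observed classes: 1 and 0
cat_sizes = [3, 2]

ll, count = 0.0, 0
for cat_size in cat_sizes:
    block_p = probs[count:count + cat_size]
    block_x = onehot[count:count + cat_size]
    ll += np.log(np.sum(block_p * block_x) + EPS)
    count += cat_size
# ll == log(0.5) + log(0.9), up to EPS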
def plot_compare(self, dataset, itr, save_dir='trajs'):
    x = FloatTensor(dataset.sample_hard(5)[0])
    x, actdata = self.splitobs(x)
    target = x[:, self.step_dim:]
    y_dist = self.decode(x, self.encode(Variable(x)).sample())
    traj_sets = [dataset.unnormalize(get_numpy(traj_set))
                 for traj_set in [target, y_dist.mle]]
    traj_names = ['expert', 'sd']
    plot_traj_sets([dataset.process(traj_set) for traj_set in traj_sets],
                   traj_names, itr, env_id=dataset.env_id)
    for traj_no in range(5):
        dataset.plot_pd_compare([traj_set[traj_no, ...] for traj_set in traj_sets],
                                traj_names, itr,
                                name='Full_State_%d' % traj_no, save_dir=save_dir)
def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params()
    # autograd.grad (not autograd.backward) returns the gradients; keep the graph
    # so the Hessian-vector product below can differentiate through them.
    constraint_grads = list(autograd.grad(f, params, create_graph=True, allow_unused=True))
    for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
        if grad is None:
            constraint_grads[idx] = Variable(torch.zeros(param.size()), requires_grad=True)

    def Hx_plain(xs):
        # Pearlmutter trick: H x = d/dtheta (grad(f) . x).
        Hx_plain_splits = list(autograd.grad(
            torch.sum(torch.stack([torch.sum(g * x)
                                   for g, x in zip(constraint_grads, xs)])),
            params, retain_graph=True, allow_unused=True))
        for idx, (Hx, param) in enumerate(zip(Hx_plain_splits, params)):
            if Hx is None:
                Hx_plain_splits[idx] = torch.zeros_like(param)
        return [x.view(-1) for x in Hx_plain_splits]

    self.f_Hx_plain = Hx_plain
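
# Hedged standalone sketch of the Hessian-vector-product trick that Hx_plain()
# relies on: differentiate (grad(f) . v) to get H v without ever forming H.
# Uses a toy quadratic and plain tensors, not the original target/params objects.
import torch

A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])           # symmetric, so the Hessian of f is A
x = torch.randn(2, requires_grad=True)
v = torch.randn(2)

f = 0.5 * x @ A @ x
(g,) = torch.autograd.grad(f, x, create_graph=True)   # g = A x, kept in the graph
(hv,) = torch.autograd.grad(g @ v, x)                  # d/dx (g . v) = A v
assert torch.allclose(hv, A @ v, atol=1e-5)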
def log_likelihood_full(self, x):
    # x is (bs, path_len, sum(cat_sizes)) one hots
    bs = x.size()[0]
    x_3d = x.view(bs, self.path_len, sum(self.cat_sizes))
    count = 0
    total_ll = Variable(torch.zeros(bs, self.path_len))
    for cat_size in self.cat_sizes:
        prob = self.probs_3d[:, :, count:count + cat_size]
        onehot = x_3d[:, :, count:count + cat_size]
        # Surrogate "log-likelihood": negative squared distance between the
        # predicted and observed class indices, rather than the true log prob.
        _, p_idx = prob.max(-1)
        _, x_idx = onehot.max(-1)
        ll = -torch.pow((p_idx - x_idx).float(), 2)
        total_ll += ll
        count += cat_size
    return total_ll
def init_hidden(self, bs):
    self.hidden = (Variable(torch.zeros(self.h_size, bs, self.hidden_dim)),
                   Variable(torch.zeros(self.h_size, bs, self.hidden_dim)))
def init_input(self, bs):
    return Variable(torch.zeros(bs, self.output_dim))
def predict(self, obs_np):
    bs, path_len, obs_dim = obs_np.shape
    return Variable(torch.zeros(bs, path_len))
def forward(self, obs_np):
    bs, obs_dim = obs_np.size()
    return Variable(torch.zeros(bs))
def init_input(self, bs):
    # Return one hot for each cat
    return Variable(torch.zeros(bs, self.output_dim))
def optimize_policy(self, itr, samples_data, add_input_fn=None,
                    add_input_input=None, add_loss_fn=None, print=True):
    # NOTE: the `print` argument shadows the builtin of the same name.
    advantages = from_numpy(samples_data['discount_adv'].astype(np.float32))
    n_traj = samples_data['obs'].shape[0]
    n_obs = n_traj * self.max_path_length
    if add_input_fn is not None:
        obs = from_numpy(samples_data['obs'][:, :self.max_path_length, :self.obs_dim]
                         .astype(np.float32)).view(n_obs, -1)
    else:
        obs = from_numpy(samples_data['obs'][:, :self.max_path_length, :]
                         .astype(np.float32)).view(n_obs, -1)
    actions = samples_data['actions'].view(n_obs, -1).data
    returns = from_numpy(samples_data['discount_returns'].copy()).view(-1, 1).float()
    old_action_log_probs = samples_data['log_prob'].view(n_obs, -1).data
    states = (samples_data['states'].view(samples_data['states'].size()[0], n_obs, -1)
              if self.policy.recurrent() else None)

    for epoch_itr in range(self.epoch):
        sampler = BatchSampler(SubsetRandomSampler(range(n_obs)),
                               self.ppo_batch_size, drop_last=False)
        for indices in sampler:
            indices = LongTensor(indices)
            obs_batch = Variable(obs[indices])
            actions_batch = actions[indices]
            return_batch = returns[indices]
            old_action_log_probs_batch = old_action_log_probs[indices]
            if states is not None:
                self.policy.set_state(Variable(states[:, indices]))
            if add_input_fn is not None:
                # Sample an additional input (e.g. a latent) and append it to the obs.
                add_input_dist = add_input_fn(Variable(add_input_input))
                add_input = add_input_dist.sample()
                add_input_rep = torch.unsqueeze(add_input, 1).repeat(
                    1, self.max_path_length, 1).view(n_obs, -1)
                add_input_batch = add_input_rep[indices]
                obs_batch = torch.cat([obs_batch, add_input_batch], -1)
            values = self.baseline.forward(obs_batch.detach())
            action_dist = self.policy.forward(obs_batch)
            action_log_probs = action_dist.log_likelihood(
                Variable(actions_batch)).unsqueeze(-1)
            dist_entropy = action_dist.entropy().mean()
            ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
            adv_targ = Variable(advantages.view(-1, 1)[indices])
            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                1.0 + self.clip_param) * adv_targ
            # PPO's pessimistic surrogate (L^CLIP)
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = (Variable(return_batch) - values).pow(2).mean()
            self.optimizer.zero_grad()
            total_loss = (value_loss + action_loss
                          - dist_entropy * self.entropy_bonus)
            if add_loss_fn is not None:
                total_loss += add_loss_fn(add_input_dist, add_input, add_input_input)
            total_loss.backward()
            self.optimizer.step()
        if print:
            # Log the losses from the last minibatch of this epoch.
            stats = {
                'total loss': get_numpy(total_loss)[0],
                'action loss': get_numpy(action_loss)[0],
                'value loss': get_numpy(value_loss)[0],
                'entropy': get_numpy(dist_entropy)[0],
            }
            with logger.prefix('Train PPO itr %d epoch itr %d | ' % (itr, epoch_itr)):
                self.print_diagnostics(stats)
    return total_loss
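
# Hedged numeric sketch (NumPy, not from the original class) of the clipped
# surrogate computed above: the objective takes the minimum of the unclipped
# and clipped ratio terms, so a sample stops contributing extra gain once its
# probability ratio leaves [1 - clip_param, 1 + clip_param].
import numpy as np

clip_param = 0.2
ratio = np.array([0.5, 1.0, 1.5])   # pi_new / pi_old for three samples
adv = np.array([1.0, 1.0, 1.0])     # positive advantages

surr1 = ratio * adv
surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
action_loss = -np.minimum(surr1, surr2).mean()
# surr2 == [0.8, 1.0, 1.2]; min(surr1, surr2) == [0.5, 1.0, 1.2], so the third
# sample's gain is capped at 1.2 even though its ratio is 1.5.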
def sample(self, dataset, sample_size):
    trajs, _ = dataset.sample(sample_size)
    latent = Variable(torch.randn((sample_size, self.latent_dim)))
    return self.decode(FloatTensor(trajs), latent), latent
def sample(self, deterministic=False):
    if deterministic:
        return self.mean
    else:
        return Variable(torch.randn(self.mean.size())) * torch.exp(self.log_var) + self.mean
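
# Hedged standalone sketch of the reparameterized sampling used by sample():
# draw unit-Gaussian noise and shift/scale it with the distribution parameters,
# so the sample stays differentiable w.r.t. mean and log_var. Mirrors the
# non-deterministic branch above, which scales the noise by exp(log_var).
import torch

mean = torch.zeros(4, 2, requires_grad=True)
log_var = torch.zeros(4, 2, requires_grad=True)
eps = torch.randn(mean.size())
z = mean + torch.exp(log_var) * eps   # same form as the stochastic branch
z.sum().backward()                    # gradients flow back into mean and log_var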