def get_action(self, obs):
    obs, goals, taus = split_flat_obs(
        obs[None],
        self.env.observation_space.low.size,
        self.env.goal_dim,
    )
    sampled_actions = self.sample_actions()
    first_sampled_actions = sampled_actions.copy()
    actions = ptu.np_to_var(sampled_actions)
    next_obs = self.expand_np_to_var(obs[0])
    goals = self.expand_np_to_var(goals[0])
    taus = self.expand_np_to_var(taus[0])
    costs = 0
    for i in range(self.mpc_horizon):
        curr_obs = next_obs
        if i > 0:
            sampled_actions = self.sample_actions()
            actions = ptu.np_to_var(sampled_actions)
        flat_obs = merge_into_flat_obs(
            curr_obs,
            goals,
            taus,
        )
        obs_delta = self.debug_qf(
            flat_obs,
            actions,
            return_internal_prediction=True,
        )
        next_obs = curr_obs + obs_delta
        next_features = self.env.convert_obs_to_goals(next_obs)
        costs += (next_features[:, :7] - goals[:, :7]) ** 2
    costs_np = ptu.get_numpy(costs).sum(1)
    min_i = np.argmin(costs_np)
    return first_sampled_actions[min_i], {}
def get_batch(self, train=True):
    dataset = self.train_dataset if train else self.test_dataset
    ind = np.random.randint(0, len(dataset['z']), self.batch_size)
    return {
        'z': ptu.np_to_var(dataset['z'][ind, :]),
        'z_proj': ptu.np_to_var(dataset['z_proj'][ind, :]),
    }
def next_state(self, state, action):
    if self.cheat:
        next_states = self.qf.eval_np(
            observations=state[None],
            actions=action[None],
            goals=state[None],
            num_steps_left=np.array([[self.num_steps_left]]),
            return_predictions=True,
        )
        return next_states[0]

    num_steps_left = ptu.np_to_var(
        self.num_steps_left * np.ones((self.sample_size, 1))
    )
    obs_dim = state.shape[0]
    states = self.expand_np_to_var(state)
    actions = self.expand_np_to_var(action)
    next_states_np = np.zeros((self.sample_size, obs_dim))
    next_states = ptu.np_to_var(next_states_np, requires_grad=True)
    optimizer = optim.Adam([next_states], self.learning_rate)
    for _ in range(self.num_optimization_steps):
        losses = -self.qf(
            observations=states,
            actions=actions,
            goals=next_states,
            num_steps_left=num_steps_left,
        )
        loss = losses.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    losses_np = ptu.get_numpy(losses)
    best_action_i = np.argmin(losses_np)
    return ptu.get_numpy(next_states[best_action_i, :])
def get_action(self, current_ob, goal, num_steps_left):
    if (self.replan_every_time_step
            or self.t_in_plan == self.planning_horizon
            or self.last_solution is None):
        if self.dynamic_lm and self.best_obs_seq is not None:
            error = np.linalg.norm(
                current_ob - self.best_obs_seq[self.t_in_plan + 1]
            )
            self.update_lagrange_multiplier(error)
        full_solution = self.replan(current_ob, goal)

        x_torch = ptu.np_to_var(full_solution, requires_grad=True)
        current_ob_torch = ptu.np_to_var(current_ob)
        obs, next_obs = self.batchify(x_torch, current_ob_torch)
        actions = self.tdm_policy(
            observations=obs,
            goals=next_obs,
            num_steps_left=self.num_steps_left_pytorch,
        )
        self.best_action_seq = ptu.get_numpy(actions)
        self.best_obs_seq = np.array(
            [current_ob] + [ptu.get_numpy(o) for o in next_obs]
        )

        self.last_solution = full_solution
        self.t_in_plan = 0

    agent_info = dict(
        best_action_seq=self.best_action_seq[self.t_in_plan:],
        best_obs_seq=self.best_obs_seq[self.t_in_plan:],
    )
    action = self.best_action_seq[self.t_in_plan]
    self.t_in_plan += 1
    return action, agent_info
def denormalize(self, v):
    mean = ptu.np_to_var(self.mean, requires_grad=False)
    std = ptu.np_to_var(self.std, requires_grad=False)
    if v.dim() == 2:
        mean = mean.unsqueeze(0)
        std = std.unsqueeze(0)
    return mean + v * std
def pretrain_encoder(encoder, opt):
    losses = []
    for _ in range(1000):
        x_np, y_np = swirl_data(BS)
        x = ptu.np_to_var(x_np)
        y = ptu.np_to_var(y_np)
        y_hat = encoder.encode(x)
        loss = ((y_hat - y) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.data.numpy())

    if VERBOSE:
        x_np, y_np = swirl_data(N_VIS)
        x = ptu.np_to_var(x_np)
        y_hat = encoder.encode(x)
        y_hat_np = y_hat.data.numpy()
        x_hat_np = t_to_xy(y_hat_np[:, 0])

        plt.subplot(2, 1, 1)
        plt.plot(np.array(losses))
        plt.title("Training Loss")

        plt.subplot(2, 1, 2)
        plt.plot(x_np[:, 0], x_np[:, 1], '.')
        plt.plot(x_hat_np[:, 0], x_hat_np[:, 1], '.')
        plt.title("Samples")
        plt.legend(["Samples", "Estimates"])
        plt.show()
def get_action(self, ob):
    if self.last_solution is None or not self.warm_start:
        init_solution = []
        for _ in range(self.planning_horizon):
            init_solution.append(
                np.repeat(ob[None], self.num_particles, axis=0)
            )
        self.last_solution = np.hstack(init_solution)

    ob = self._expand_np_to_var(ob)
    actions_np = np.hstack(
        [self.sample_actions() for _ in range(self.planning_horizon)]
    )
    actions = ptu.np_to_var(actions_np)
    next_states = ptu.np_to_var(self.last_solution, requires_grad=True)
    optimizer = optim.Adam([next_states], lr=self.learning_rate)
    for i in range(self.num_grad_steps):
        constraint_loss = self.constraint_fctn(ob, actions, next_states)
        optimizer.zero_grad()
        constraint_loss.sum().backward()
        optimizer.step()

    final_loss = (
        self.cost_function(ob, actions, next_states)
        + self.lagrange_multiplier
        * self.constraint_fctn(ob, actions, next_states)
    )
    self.last_solution = ptu.get_numpy(next_states)
    final_loss_np = ptu.get_numpy(final_loss).sum(axis=1)
    min_i = np.argmin(final_loss_np)
    action = actions_np[min_i, :self.action_dim]
    return action, {}
def train_encoder(encoder, decoder, encoder_opt):
    batch, true_latents = swirl_data(BS)
    batch = ptu.np_to_var(batch)

    latents, means, log_stds, stds = encoder.get_encoding_and_suff_stats(
        batch
    )
    kl = kl_to_prior(means, log_stds, stds)

    latents = encoder.encode(batch)
    decoder_output = decoder(latents)
    decoder_means = decoder_output[:, 0:2]
    decoder_log_stds = decoder_output[:, 2:4]
    distribution = Normal(decoder_means, decoder_log_stds.exp())
    reconstruction_log_prob = distribution.log_prob(batch).sum(dim=1)

    # elbo = - kl + reconstruction_log_prob
    # loss = - elbo.mean()
    loss = - reconstruction_log_prob.mean()
    # This is the second place where we cheat:
    latent_loss = ((ptu.np_to_var(true_latents) - latents) ** 2).mean()
    loss = loss  # + latent_loss
    encoder_opt.zero_grad()
    loss.backward()
    encoder_opt.step()
    return loss
def _encode(self, imgs, noisy, clip_std=None, batch_size=None):
    if batch_size is None:
        mu, logvar = self.vae.encode(ptu.np_to_var(imgs))
    else:
        imgs = imgs.reshape(-1, self.vae.imlength)
        n = imgs.shape[0]
        mu, logvar = None, None
        for i in range(0, n, batch_size):
            batch_mu, batch_logvar = self.vae.encode(
                ptu.np_to_var(imgs[i:i + batch_size])
            )
            if mu is None:
                mu = batch_mu
                logvar = batch_logvar
            else:
                mu = torch.cat((mu, batch_mu), dim=0)
                logvar = torch.cat((logvar, batch_logvar), dim=0)
    std = logvar.mul(0.5).exp_()
    if clip_std is None:
        clip_std = self.clip_encoding_std
    if clip_std:
        vae_std = np.copy(self.vae.dist_std)
        vae_std = ptu.np_to_var(vae_std)
        std = torch.min(std, vae_std)
    if noisy:
        eps = ptu.Variable(std.data.new(std.size()).normal_())
        sample = eps.mul(std).add_(mu)
    else:
        sample = mu
    return ptu.get_numpy(sample), ptu.get_numpy(mu), ptu.get_numpy(std)
def get_np_action(self, state_np, goal_state_np):
    return ptu.get_numpy(
        self.policy(
            ptu.np_to_var(np.expand_dims(state_np, 0)),
            ptu.np_to_var(np.expand_dims(goal_state_np, 0)),
            self._tau_expanded_torch,
        ).squeeze(0)
    )
def next_state(self, state, action, goal_state, discount):
    state = ptu.np_to_var(np.expand_dims(state, 0))
    action = ptu.np_to_var(np.expand_dims(action, 0))
    goal_state = ptu.np_to_var(np.expand_dims(goal_state, 0))
    discount = ptu.np_to_var(np.array([[discount]]))
    return ptu.get_numpy(
        self.qf(state, action, goal_state, discount) + state
    )[0]
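# The helper below is a usage sketch, not part of the original code: it assumes
# `model` is an instance of the class defining `next_state` above (which treats
# the Q-function output as a predicted state delta) and that `env` exposes a
# gym-style `action_space`. The names `model` and `env` are hypothetical.
def example_open_loop_rollout(model, env, start_state, goal_state, discount,
                              horizon=5):
    """Roll the learned model forward for a few steps with random actions."""
    state = start_state
    states = [state]
    for _ in range(horizon):
        action = env.action_space.sample()
        state = model.next_state(state, action, goal_state, discount)
        states.append(state)
    return np.array(states)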
def evaluate(x, y):
    action = np.array([x, y])
    action = ptu.np_to_var(action).unsqueeze(0)
    state = ptu.np_to_var(start_state).unsqueeze(0)
    goal_states = ptu.np_to_var(goal_state).unsqueeze(0)
    discount = ptu.np_to_var(np.array([[0]]))
    out = qf(state, action, goal_states, discount)
    return out.data.numpy()
def get_action(self, obs):
    sampled_actions = self.sample_actions()
    first_sampled_actions = sampled_actions.copy()
    all_actions_np = [first_sampled_actions]
    actions = ptu.np_to_var(sampled_actions)
    next_obs = self.expand_np_to_var(obs)
    all_obs_torch = [next_obs]
    costs = 0
    all_costs = []
    for i in range(self.mpc_horizon):
        curr_obs = next_obs
        if i > 0:
            sampled_actions = self.sample_actions()
            all_actions_np.append(sampled_actions)
            actions = ptu.np_to_var(sampled_actions)
        next_obs = curr_obs + self.dynamics_model(curr_obs, actions)
        all_obs_torch.append(next_obs)
        new_costs = self.cost_fn(
            ptu.get_numpy(curr_obs),
            ptu.get_numpy(actions),
            ptu.get_numpy(next_obs),
        )
        costs = costs + new_costs
        all_costs.append(new_costs)

    # Reward sum of costs or just last time step?
    # min_i = np.argmin(costs)
    min_costs = np.array(all_costs).min(0)
    min_i = np.argmin(min_costs)

    # For Point2d u-shaped wall
    # best_action_seq = [action_t[min_i, :] for action_t in all_actions_np]
    # best_obs_seq = [
    #     ptu.get_numpy(ob_t[min_i, :]) for ob_t in all_obs_torch
    # ]
    #
    # real_obs_seq = self.env.wrapped_env.wrapped_env.true_states(obs, best_action_seq)
    # self.ax1.clear()
    # self.env.wrapped_env.wrapped_env.plot_trajectory(
    #     self.ax1,
    #     np.array(best_obs_seq),
    #     np.array(best_action_seq),
    #     goal=self.env.wrapped_env.wrapped_env._target_position,
    # )
    # self.ax1.set_title("imagined")
    # self.ax2.clear()
    # self.env.wrapped_env.wrapped_env.plot_trajectory(
    #     self.ax2,
    #     np.array(real_obs_seq),
    #     np.array(best_action_seq),
    #     goal=self.env.wrapped_env.wrapped_env._target_position,
    # )
    # self.ax2.set_title("real")
    # plt.draw()
    # plt.pause(0.001)
    return first_sampled_actions[min_i], {}
def normalize(self, v, clip_range=None):
    if clip_range is None:
        clip_range = self.default_clip_range
    mean = ptu.np_to_var(self.mean, requires_grad=False)
    std = ptu.np_to_var(self.std, requires_grad=False)
    if v.dim() == 2:
        # Unsqueeze along the batch dimension and rely on automatic
        # broadcasting.
        mean = mean.unsqueeze(0)
        std = std.unsqueeze(0)
    return torch.clamp((v - mean) / std, -clip_range, clip_range)
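# Usage sketch (not from the original source): round-trips a batch through
# normalize/denormalize. `normalizer` stands in for an instance of the class the
# two methods above belong to, with `mean`, `std`, and `default_clip_range`
# already populated; the name is hypothetical.
def example_normalizer_round_trip(normalizer, batch_np):
    v = ptu.np_to_var(batch_np)            # shape (batch_size, dim)
    v_norm = normalizer.normalize(v)       # clipped, roughly zero-mean/unit-std
    v_back = normalizer.denormalize(v_norm)
    # v_back matches v only where normalize() did not clip.
    return ptu.get_numpy(v_back)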
def _realistic_subgoal_reward(self, subgoals, use_double=True):
    if type(subgoals) is np.ndarray:
        subgoals = ptu.np_to_var(subgoals, double=use_double)
    if hasattr(self, "true_prior_distr"):
        log_prob = self.true_prior_distr.log_prob(subgoals)
        log_prob = torch.sum(log_prob, dim=-1)
        return log_prob
    else:
        return ptu.np_to_var(np.zeros(subgoals.shape[:-1]))
def get_batch(self, train=True):
    dataset = self.train_dataset if train else self.test_dataset
    ind = np.random.randint(0, len(dataset['obs']), self.batch_size)
    samples_obs = dataset['obs'][ind, :]
    samples_actions = dataset['actions'][ind, :]
    samples_next_obs = dataset['next_obs'][ind, :]
    return {
        'obs': ptu.np_to_var(samples_obs),
        'actions': ptu.np_to_var(samples_actions),
        'next_obs': ptu.np_to_var(samples_next_obs),
    }
def _np_to_pytorch(self, ob_np, goal_np, taus_np, batch_size=1):
    ob_np = np.tile(ob_np, (batch_size, 1, 1))
    goal_np = np.tile(goal_np, (batch_size, 1, 1))
    taus_np = np.tile(
        taus_np.reshape((1, self.num_subprobs, 1)),
        (batch_size, 1, 1),
    )
    ob = ptu.np_to_var(ob_np, double=self.use_double)
    goal = ptu.np_to_var(goal_np, double=self.use_double)
    taus = ptu.np_to_var(taus_np, double=self.use_double)
    return ob, goal, taus
def _action_cost(self, x, current_ob, goal):
    x = ptu.np_to_var(x, requires_grad=True)
    actions = x.unsqueeze(0)
    current_obs = ptu.np_to_var(current_ob[None])
    goals = ptu.np_to_var(goal[None])
    num_steps_left = ptu.np_to_var(np.zeros((1, 1)))
    prob_reach = self.beta_q(current_obs, actions, goals, num_steps_left)
    # The cost is the negated reach probability; return its value and its
    # gradient so the two stay consistent for the downstream optimizer.
    loss = -prob_reach
    loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
    loss.backward()
    gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
    return loss_np, gradient_np
def __init__(
        self,
        representation_size,
        input_size,
        hidden_sizes=list([64, 128, 64]),
        init_w=1e-3,
        hidden_init=ptu.fanin_init,
        output_activation=identity,
        output_scale=1,
        layer_norm=False,
        normalize=True,
        train_data_mean=None,
        train_data_std=None,
        **kwargs
):
    self.save_init_params(locals())
    super().__init__()
    self.representation_size = representation_size
    self.hidden_init = hidden_init
    self.output_activation = output_activation
    self.dist_mu = np.zeros(self.representation_size)
    self.dist_std = np.ones(self.representation_size)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.init_w = init_w
    hidden_sizes = list(hidden_sizes)
    self.input_size = input_size
    self.encoder = TwoHeadMlp(
        hidden_sizes,
        representation_size,
        representation_size,
        input_size,
        layer_norm=layer_norm,
        hidden_init=hidden_init,
        output_activation=output_activation,
        init_w=init_w,
    )
    hidden_sizes.reverse()
    self.decoder = Mlp(
        hidden_sizes,
        input_size,
        representation_size,
        layer_norm=layer_norm,
        hidden_init=hidden_init,
        output_activation=output_activation,
        init_w=init_w,
    )
    self.output_scale = output_scale
    self.normalize = normalize
    if train_data_mean is None:
        self.train_data_mean = ptu.np_to_var(np.zeros(input_size))
    else:
        self.train_data_mean = train_data_mean
    if train_data_std is None:
        self.train_data_std = ptu.np_to_var(np.ones(input_size))
    else:
        self.train_data_std = train_data_std
def __init__(
        self,
        train_dataset,
        test_dataset,
        model,
        batch_size=128,
        beta=0.5,
        beta_schedule=None,
        lr=1e-3,
        extra_recon_logging=dict(),
        recon_weights=None,
        recon_loss_type='mse',
        **kwargs
):
    assert recon_loss_type in ['mse', 'wse']
    self.batch_size = batch_size
    self.beta = beta
    self.beta_schedule = beta_schedule
    if self.beta_schedule is None:
        self.beta_schedule = ConstantSchedule(self.beta)
    if ptu.gpu_enabled():
        model.cuda()
    self.model = model
    self.representation_size = model.representation_size
    self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
    self.train_dataset, self.test_dataset = train_dataset, test_dataset
    assert self.train_dataset['next_obs'].dtype == np.float32
    assert self.test_dataset['next_obs'].dtype == np.float32
    assert self.train_dataset['obs'].dtype == np.float32
    assert self.test_dataset['obs'].dtype == np.float32
    self.normalize = model.normalize
    self.mse = nn.MSELoss()
    if self.normalize:
        self.train_data_mean = ptu.np_to_var(
            np.mean(self.train_dataset['next_obs'], axis=0)
        )
        np_std = np.std(self.train_dataset['next_obs'], axis=0)
        for i in range(len(np_std)):
            if np_std[i] < 1e-3:
                np_std[i] = 1.0
        self.train_data_std = ptu.np_to_var(np_std)
        self.model.train_data_mean = self.train_data_mean
        self.model.train_data_std = self.train_data_std
    self.extra_recon_logging = extra_recon_logging
    self.recon_weights = recon_weights
    self.recon_loss_type = recon_loss_type
def cost_function(self, x, current_ob):
    self.forward -= time.time()
    x = ptu.np_to_var(x, requires_grad=True)
    current_ob = ptu.np_to_var(current_ob)
    loss = (
        self.lagrange_multipler
        * self._feasibility_cost_function(x, current_ob)
        + self._env_cost_function(x, current_ob)
    )
    loss_np = ptu.get_numpy(loss)[0].astype(np.float64)
    self.forward += time.time()

    self.backward -= time.time()
    loss.squeeze(0).backward()
    gradient_np = ptu.get_numpy(x.grad).astype(np.float64)
    self.backward += time.time()
    return loss_np, gradient_np
def dump_reconstructions(vae_env, epoch, n_recon=16):
    from railrl.core import logger
    import os.path as osp
    from torchvision.utils import save_image

    if vae_env.use_vae_dataset and vae_env.vae_dataset_path is not None:
        from multiworld.core.image_env import normalize_image
        from railrl.misc.asset_loader import local_path_from_s3_or_local_path
        filename = local_path_from_s3_or_local_path(vae_env.vae_dataset_path)
        dataset = np.load(filename).item()
        sampled_idx = np.random.choice(dataset['next_obs'].shape[0], n_recon)
        if vae_env.vae_input_key_prefix == 'state':
            states = dataset['next_obs'][sampled_idx]
            imgs = ptu.np_to_var(
                vae_env.wrapped_env.states_to_images(states)
            )
            recon_samples, _, _ = vae_env.vae(ptu.np_to_var(states))
            recon_imgs = ptu.np_to_var(
                vae_env.wrapped_env.states_to_images(
                    ptu.get_numpy(recon_samples)
                )
            )
        else:
            imgs = ptu.np_to_var(
                normalize_image(dataset['next_obs'][sampled_idx])
            )
            recon_imgs, _, _, _ = vae_env.vae(imgs)
        del dataset
    else:
        return

    comparison = torch.cat([
        imgs.narrow(
            start=0,
            length=vae_env.wrapped_env.image_length,
            dimension=1,
        ).contiguous().view(
            -1,
            vae_env.wrapped_env.channels,
            vae_env.wrapped_env.imsize,
            vae_env.wrapped_env.imsize,
        ),
        recon_imgs.contiguous().view(
            n_recon,
            vae_env.wrapped_env.channels,
            vae_env.wrapped_env.imsize,
            vae_env.wrapped_env.imsize,
        )[:n_recon],
    ])
    if epoch is not None:
        save_dir = osp.join(logger.get_snapshot_dir(), 'r_%d.png' % epoch)
    else:
        save_dir = osp.join(logger.get_snapshot_dir(), 'r.png')
    save_image(comparison.data.cpu(), save_dir, nrow=n_recon)
def _cost_function(self, x, order):
    x = ptu.np_to_var(x, requires_grad=True)
    loss = 0
    for action, next_state in self.split(x):
        next_features_predicted = next_state[self.goal_slice]
        desired_features = ptu.np_to_var(
            self.env.multitask_goal[self.multitask_goal_slice]
            * np.ones(next_features_predicted.shape)
        )
        diff = next_features_predicted - desired_features
        loss += (diff ** 2).sum()
    if order == 0:
        return ptu.get_numpy(loss)[0]
    elif order == 1:
        loss.squeeze(0).backward()
        return ptu.get_numpy(x.grad)
def get_loss(training=False):
    buffer = replay_buffer.get_replay_buffer(training)
    batch = buffer.random_batch(batch_size)
    obs = ptu.np_to_var(batch['observations'], requires_grad=False)
    goals = ptu.np_to_var(batch['goal_states'], requires_grad=False)
    goal = goal_chooser(obs, goals)
    actions = argmax_q(obs, goal, discount)
    final_state_predicted = goal_conditioned_model(
        obs,
        actions,
        goal,
        discount,
    ) + obs
    rewards = goal_chooser.reward_function(final_state_predicted, goals)
    return -rewards.mean()
def choose_action_to_reach_adam(self, current_ob, goal):
    n_parts = 100
    x0 = np.vstack([
        self.env.action_space.sample()
        for _ in range(n_parts)
    ])
    current_obs = ptu.np_to_var(current_ob).unsqueeze(0).repeat(n_parts, 1)
    goals = ptu.np_to_var(goal).unsqueeze(0).repeat(n_parts, 1)
    num_steps_left = ptu.np_to_var(np.zeros((n_parts, 1)))
    best_action, _ = fmin_adam_torch(
        self._action_cost_batch,
        x0,
        f_args=(current_obs, goals, num_steps_left),
    )
    return best_action
def get_action(self, obs):
    action_inits = self.sample_actions()
    actions = ptu.np_to_var(action_inits, requires_grad=True)
    obs = self.expand_np_to_var(obs)
    optimizer = optim.Adam([actions], self.learning_rate)

    losses = -self.qf(
        obs,
        actions,
        self._goal_batch,
        self._tau_batch,
    )
    for _ in range(self.num_gradient_steps):
        loss = losses.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses = -self.qf(
            obs,
            actions,
            self._goal_batch,
            self._tau_batch,
        )
    losses_np = ptu.get_numpy(losses)
    best_action_i = np.argmin(losses_np)
    return ptu.get_numpy(actions[best_action_i, :]), {}
def replan(self, current_ob, goal):
    if self.last_solution is None or not self.warm_start:
        solution = []
        for i in range(self.planning_horizon):
            solution.append(current_ob)
        self.last_solution = np.hstack(solution)
    self.desired_features_torch = ptu.np_to_var(
        goal[None].repeat(self.planning_horizon, 0)
    )
    self.forward = self.backward = 0
    start = time.time()
    x, f, d = optimize.fmin_l_bfgs_b(
        self.cost_function,
        self.last_solution,
        args=(current_ob,),
        bounds=self.bounds,
        **self.solver_kwargs
    )
    total = time.time() - start
    self.totals.append(total)
    # print("total forward: {}".format(self.forward))
    # print("total backward: {}".format(self.backward))
    # print("total: {}".format(total))
    # print("extra: {}".format(total - self.forward - self.backward))
    # print("total mean: {}".format(np.mean(self.totals)))
    warnflag = d['warnflag']
    if warnflag != 0:
        if warnflag == 1:
            print("too many function evaluations or too many iterations")
        else:
            print(d['task'])
    return x
def simulate_policy(args):
    ptu.set_gpu_mode(True)
    model = pickle.load(open(args.file, "rb"))  # joblib.load(args.file)
    model.to(ptu.device)
    imgs = np.load(args.imgfile)
    import ipdb; ipdb.set_trace()
    z = model.encode(ptu.np_to_var(imgs))
    samples = model.decode(z).cpu()
    recon_imgs = samples.data.view(
        64, model.input_channels, model.imsize, model.imsize
    )
    recon_imgs = recon_imgs.cpu()
    grid = make_grid(recon_imgs, nrow=8)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
    im = Image.fromarray(ndarr)
    im.show()
    # cv2.imshow('img', im)
    # cv2.waitKey(1)
    # for sample in samples:
    #     tensor = tensor.cpu()
    #     img = ptu.get_numpy(tensor)
    original_imgs = ptu.np_to_var(imgs).view_as(recon_imgs).data.cpu()
    comparison = torch.cat([
        recon_imgs,
        original_imgs,
    ])
    save_dir = osp.join(logger.get_snapshot_dir(), 'r.png')
    save_image(comparison.data.cpu(), save_dir, nrow=8)
def replan(self, current_ob, goal):
    if self.last_solution is None or not self.warm_start:
        solution = []
        for i in range(self.planning_horizon):
            solution.append(np.zeros(self.action_dim))
            solution.append(current_ob)
        self.last_solution = np.hstack(solution)
    self.desired_features_torch = ptu.np_to_var(
        goal[None].repeat(self.planning_horizon, 0)
    )
    self.forward = self.backward = 0
    start = time.time()
    x, f, d = optimize.fmin_l_bfgs_b(
        self.cost_function,
        self.last_solution,
        args=(current_ob,),
        bounds=self.bounds,
        **self.solver_kwargs
    )
    self.t1 = np.array([
        1, 0, 1, 0,
        1, 0, 2, 0,
        0, 1, 2, 1,
        0, 1, 2, 2,
    ])
    self.t2 = np.array([
        1, 0, 2, 0,
        0, 1, 2, 1,
        0, 1, 2, 2,
        -1, 1, 1, 3,
    ])
    total = time.time() - start
    self.totals.append(total)
    process_lbfgs_debug_dict(d)
    return x
def fmin_adam_torch(
        batch_torch_f,
        x0_np,
        f_args=None,
        f_kwargs=None,
        lr=1e-3,
        num_steps=100,
):
    if f_args is None:
        f_args = tuple()
    if f_kwargs is None:
        f_kwargs = {}
    x = ptu.np_to_var(x0_np, requires_grad=True)
    optimizer = Adam([x], lr=lr)
    for _ in range(num_steps):
        loss = batch_torch_f(x, *f_args, **f_kwargs).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    final_values_np = ptu.get_numpy(batch_torch_f(x, *f_args, **f_kwargs))
    final_x_np = ptu.get_numpy(x)
    min_i = np.argmin(final_values_np)
    return final_x_np[min_i], final_values_np[min_i]
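# Usage sketch (not from the original source): minimizes a batch of independent
# quadratics with fmin_adam_torch. `batch_quadratic` and `target` are made-up
# names for illustration; the only dependencies are numpy and the ptu helpers
# used above.
def example_fmin_adam_torch_usage():
    def batch_quadratic(x, target):
        # x: (num_candidates, dim) candidate solutions; returns per-candidate loss
        return ((x - target) ** 2).sum(dim=1)

    x0 = np.random.uniform(-1, 1, size=(128, 2))
    target = ptu.np_to_var(np.array([0.5, -0.25]))
    best_x, best_value = fmin_adam_torch(
        batch_quadratic,
        x0,
        f_args=(target,),
        lr=1e-2,
        num_steps=200,
    )
    return best_x, best_value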