def rollout(self, max_path_length, add_input=None, volatile=False):
    sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[])
    obs = self.envs.reset()
    self.policy.reset(len(obs))
    for s in range(max_path_length):
        policy_input = Variable(from_numpy(np.stack(obs)).float(), volatile=volatile)
        if add_input is not None:
            policy_input = torch.cat([policy_input, add_input], -1)
        action_dist = self.policy.forward(policy_input)
        action = action_dist.sample()
        # With probability random_action_p, replace each sampled action with a
        # uniformly random one (epsilon-style exploration).
        if self.random_action_p > 0:
            flip = np.random.binomial(1, self.random_action_p, size=len(obs))
            if flip.sum() > 0:
                random_act = np.random.randint(0, int(self.env.action_space.flat_dim),
                                               size=flip.sum())
                action[from_numpy(flip).byte()] = from_numpy(random_act)
        next_obs, rewards, done, info = self.envs.step(get_numpy(action))
        sd['obs'].append(obs)
        sd['rewards'].append(rewards)
        sd['actions'].append(action)
        sd['action_dist_lst'].append(action_dist)
        obs = next_obs
    # Append the final observation, so obs holds max_path_length + 1 entries.
    sd['obs'].append(obs)
    sd['obs'] = np.stack(sd['obs'], 1)  # (bs, max_path_length + 1, obs_dim)
    sd['rewards'] = np.stack(sd['rewards'], 1)  # (bs, max_path_length)
    sd['actions'] = torch.stack(sd['actions'], 1)
    sd['action_dist'] = sd['action_dist_lst'][0].combine(
        sd['action_dist_lst'], torch.stack, axis=1)
    return sd
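# Illustrative sketch (not from the repo): the epsilon-random action
# replacement used in rollout() above, isolated on plain NumPy/PyTorch
# tensors. `n_actions` stands in for self.env.action_space.flat_dim, and the
# repo's `.byte()` mask is written as the modern `.bool()` here.
import numpy as np
import torch

def randomize_actions(action, random_action_p, n_actions):
    # action: LongTensor of sampled discrete actions, shape (batch,)
    flip = np.random.binomial(1, random_action_p, size=action.shape[0])
    if flip.sum() > 0:
        # Resample the flipped entries uniformly over the action set.
        random_act = np.random.randint(0, n_actions, size=int(flip.sum()))
        action[torch.from_numpy(flip).bool()] = torch.from_numpy(random_act)
    return action

# e.g. randomize_actions(torch.zeros(8, dtype=torch.long), 0.3, 4)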
def setup_dataloader(self):
    data = from_numpy(self.train_data.astype(np.float32))
    target = from_numpy(self.train_target.astype(np.float32))
    assert data.shape[0] >= self.batch_size, \
        "Data size must be at least the batch size"
    return DataLoader(TensorDataset(data, target),
                      batch_size=self.batch_size, shuffle=True, drop_last=True)
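# Hedged usage sketch (synthetic data, illustrative names): a standalone
# equivalent of setup_dataloader() built directly on torch.utils.data.
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

def demo_loader(batch_size=32):
    data = torch.from_numpy(np.random.randn(256, 10).astype(np.float32))
    target = torch.from_numpy(np.random.randn(256, 1).astype(np.float32))
    loader = DataLoader(TensorDataset(data, target),
                        batch_size=batch_size, shuffle=True, drop_last=True)
    for x, y in loader:
        # drop_last=True guarantees every batch has exactly batch_size rows,
        # so downstream code can assume a fixed batch dimension.
        assert x.shape[0] == batch_size
    return loader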
def rollout(self, max_path_length, add_input=None, reset_args=None, volatile=False):
    sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[], states=[])
    obs = self.reset_envs(reset_args)
    self.policy.reset(obs.shape[0])
    for s in range(max_path_length):
        # Snapshot the recurrent state before the policy consumes this step.
        state = self.policy.get_state().data if self.policy.recurrent() else None
        if self.ego:
            # Egocentric observations: subtract the reset position so the
            # indexed coordinates are relative to the start state.
            obs_ego = obs.copy()
            obs_ego[:, self.egoidx] -= reset_args[:, self.egoidx]
            policy_input = Variable(from_numpy(obs_ego).float(), volatile=volatile)
        else:
            policy_input = Variable(from_numpy(obs).float(), volatile=volatile)
        if add_input is not None:
            policy_input = torch.cat([policy_input, add_input], -1)
        action_dist = self.policy.forward(policy_input)
        action = action_dist.sample()
        if self.random_action_p > 0:
            flip = np.random.binomial(1, self.random_action_p, size=len(obs))
            if flip.sum() > 0:
                random_act = np.random.randint(0, self.policy.output_dim,
                                               size=flip.sum())
                action[from_numpy(flip).byte()] = from_numpy(random_act)
        next_obs, rewards, done, info = self.step_envs(get_numpy(action))
        sd['obs'].append(obs)
        sd['rewards'].append(rewards)
        sd['actions'].append(action)
        sd['action_dist_lst'].append(action_dist)
        sd['states'].append(state)
        obs = next_obs
    # Append the final observation, so obs holds max_path_length + 1 entries.
    sd['obs'].append(obs)
    sd['obs'] = np.stack(sd['obs'], 1)  # (bs, max_path_length + 1, obs_dim)
    sd['states'] = torch.stack(sd['states'], 2) if self.policy.recurrent() else None
    sd['rewards'] = np.stack(sd['rewards'], 1)  # (bs, max_path_length)
    sd['actions'] = torch.stack(sd['actions'], 1)
    sd['action_dist'] = sd['action_dist_lst'][0].combine(
        sd['action_dist_lst'], torch.stack, axis=1)
    return sd
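# Sketch of the egocentric recentering used above (illustrative, standalone):
# the coordinates indexed by `egoidx` are shifted to be relative to each
# trajectory's reset state, mirroring self.egoidx / reset_args.
import numpy as np

def egocentric(obs, reset_args, egoidx):
    # obs, reset_args: (batch, obs_dim); egoidx: indices of position coords
    obs_ego = obs.copy()
    obs_ego[:, egoidx] -= reset_args[:, egoidx]
    return obs_ego

# e.g. egocentric(obs, reset_args, np.array([0, 1])) makes the first two
# coordinates relative to where each rollout started.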
def rollout(env, policy, max_path_length, add_input=None, volatile=False,
            reset_args=None):
    # Single-environment variant: collects one path of length max_path_length.
    sd = dict(obs=[], rewards=[], actions=[], action_dist_lst=[])
    obs = env.reset(reset_args)
    for s in range(max_path_length):
        policy_input = Variable(from_numpy(np.array([obs])).float(),
                                volatile=volatile)
        if add_input is not None:
            policy_input = torch.cat([policy_input, add_input], -1)
        if s == 0:
            policy.reset(1)
        if policy.recurrent():
            # Recurrent policies expect a leading time dimension.
            policy_input = policy_input.unsqueeze(0)
        action_dist = policy.forward(policy_input)
        action = action_dist.sample()
        next_obs, reward, done, info = env.step(get_numpy(action))
        sd['obs'].append(obs)
        sd['rewards'].append(reward)
        sd['actions'].append(action)
        obs = next_obs
    # Append the final observation, so obs holds max_path_length + 1 entries.
    sd['obs'].append(obs)
    sd['obs'] = np.array(sd['obs'])          # (max_path_length + 1, obs_dim)
    sd['rewards'] = np.array(sd['rewards'])  # (max_path_length,)
    sd['actions'] = torch.stack(sd['actions'], 1)
    return sd
def fit(self, obs_np, returns_np):
    # Re-initialize the value network and regress it onto empirical returns.
    self.network.apply(xavier_init)
    bs, path_len, obs_dim = obs_np.shape
    obs = from_numpy(obs_np.reshape(-1, obs_dim).astype(np.float32))
    returns = from_numpy(returns_np.reshape(-1).astype(np.float32))
    dataloader = DataLoader(TensorDataset(obs, returns),
                            batch_size=self.batch_size, shuffle=True)
    for epoch in range(self.max_epochs):
        for x, y in dataloader:
            self.optimizer.zero_grad()
            x = Variable(x)
            y = Variable(y).float().view(-1, 1)
            loss = (self.network(x) - y).pow(2).mean()  # mean squared error
            loss.backward()
            self.optimizer.step()
        print('loss %f' % get_numpy(loss).item())
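# Hedged usage sketch for fit(): regress a small MLP onto synthetic returns.
# The network and optimizer are stand-ins for self.network / self.optimizer.
import torch
import torch.nn as nn

def demo_fit(bs=16, path_len=50, obs_dim=8):
    network = nn.Sequential(nn.Linear(obs_dim, 32), nn.Tanh(), nn.Linear(32, 1))
    optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)
    obs = torch.randn(bs * path_len, obs_dim)
    returns = torch.randn(bs * path_len, 1)
    for epoch in range(5):
        optimizer.zero_grad()
        loss = (network(obs) - returns).pow(2).mean()  # same MSE objective
        loss.backward()
        optimizer.step()
    return loss.item()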
def __init__(self, input_dim, output_dim, init):
    super(Parameter, self).__init__()
    self.output_dim = output_dim
    self.init = init
    # Constant initialization: a (1, output_dim) tensor filled with `init`,
    # registered as a learnable parameter.
    self.param_init = from_numpy(np.zeros((1, output_dim)) + init).float()
    self.params_var = nn.Parameter(self.param_init)
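# Usage sketch: the module above is a learnable constant vector; wrapping the
# init tensor in nn.Parameter is what registers it with optimizers. A minimal
# modern-PyTorch equivalent (illustrative, not the repo's exact class):
import torch
import torch.nn as nn

class ConstantParameter(nn.Module):
    def __init__(self, output_dim, init=0.0):
        super().__init__()
        self.params_var = nn.Parameter(torch.full((1, output_dim), float(init)))

    def forward(self, batch_size):
        # Broadcast the single learned row across the batch.
        return self.params_var.expand(batch_size, -1)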
def optimize_policy(self, itr, samples_data):
    prev_param = get_numpy(self._target.get_params_flat())
    self.policy.zero_grad()
    loss_before = self.loss(samples_data)
    loss_before.backward()
    flat_g = self.policy.get_params_flat()
    loss_before = get_numpy(loss_before).item()
    # Approximately solve H x = g with conjugate gradient, using only
    # Hessian-vector products.
    Hx = self._hvp_approach.build_eval(samples_data)
    descent_direction = krylov.cg(Hx, flat_g, cg_iters=self._cg_iters)
    # Largest step size satisfying the quadratic model of the KL constraint.
    initial_step_size = np.sqrt(
        2.0 * self._max_constraint_val *
        (1. / (descent_direction.dot(Hx(descent_direction)) + 1e-8)))
    if np.isnan(initial_step_size):
        initial_step_size = 1.
    flat_descent_step = initial_step_size * descent_direction
    logger.log("descent direction computed")
    # Backtracking line search: shrink the step until the loss improves and
    # the constraint is satisfied.
    n_iter = 0
    for n_iter, ratio in enumerate(self._backtrack_ratio **
                                   np.arange(self._max_backtracks)):
        cur_step = ratio * flat_descent_step
        cur_param = prev_param - cur_step
        self._target.set_params_flat(from_numpy(cur_param))
        loss, constraint_val = self.compute_loss_terms(samples_data)
        if self._debug_nan and np.isnan(constraint_val):
            import ipdb
            ipdb.set_trace()
        if loss < loss_before and constraint_val <= self._max_constraint_val:
            break
    if (np.isnan(loss) or np.isnan(constraint_val) or loss >= loss_before
            or constraint_val >= self._max_constraint_val) and not self._accept_violation:
        logger.log("Line search condition violated. Rejecting the step!")
        if np.isnan(loss):
            logger.log("Violated because loss is NaN")
        if np.isnan(constraint_val):
            logger.log("Violated because constraint %s is NaN" %
                       self._constraint_name)
        if loss >= loss_before:
            logger.log("Violated because loss not improving")
        if constraint_val >= self._max_constraint_val:
            logger.log("Violated because constraint %s is violated" %
                       self._constraint_name)
        # Restore the pre-update parameters (same setter used above).
        self._target.set_params_flat(from_numpy(prev_param))
    logger.log("backtrack iters: %d" % n_iter)
    logger.log("computing loss after")
    logger.log("optimization finished")
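# For reference, a minimal conjugate-gradient solver of the kind krylov.cg
# provides: it approximately solves H x = g using only Hessian-vector
# products Hx(v), which is what makes the TRPO step above tractable without
# materializing H. Illustrative sketch, not the repo's implementation.
import numpy as np

def conjugate_gradient(Hx, g, cg_iters=10, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()  # residual g - H x (x starts at zero)
    p = g.copy()  # search direction
    rr = r.dot(r)
    for _ in range(cg_iters):
        Hp = Hx(p)
        alpha = rr / (p.dot(Hp) + 1e-8)
        x += alpha * p
        r -= alpha * Hp
        new_rr = r.dot(r)
        if new_rr < tol:
            break
        p = r + (new_rr / rr) * p
        rr = new_rr
    return x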
def rollout_meta(self, latents, cur_obs, reward_fn, rstate):
    # Decode a sequence of latent "subgoals" into imagined state trajectories,
    # chaining segments by feeding each segment's final state back in.
    nbatch = latents.shape[1]
    state = cur_obs
    trajs = []
    for lat in latents:
        latent_v = np_to_var(lat)
        state_v = from_numpy(state).float()
        sd_traj = self.vae.decode(state_v, latent_v)
        self.vae.decoder.zero_grad()
        decoded_traj = get_numpy(sd_traj.mle).reshape(
            (nbatch, -1, cur_obs.shape[1]))
        state = decoded_traj[:, -1]  # start the next segment from the last state
        trajs.append(decoded_traj)
    combo_traj = np.concatenate(trajs, axis=1)
    rewards, rstate = self.eval_rewards(combo_traj, reward_fn, rstate,
                                        discount=True)
    return rewards, combo_traj
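# The key idea in rollout_meta(): long imagined rollouts are built by chaining
# fixed-length decoded segments, each conditioned on the previous segment's
# final state. A dependency-free sketch with an abstract decoder:
import numpy as np

def chain_segments(decode, state, latents):
    # decode(state, latent) -> (nbatch, seg_len, obs_dim) imagined segment
    trajs = []
    for lat in latents:
        seg = decode(state, lat)
        state = seg[:, -1]  # the next segment starts where this one ended
        trajs.append(seg)
    return np.concatenate(trajs, axis=1)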
def optimize_policy(self, itr, samples_data, add_input_fn=None,
                    add_input_input=None, add_loss_fn=None, print=True):
    # NOTE: the `print` argument shadows the builtin; the name is kept for
    # compatibility with existing callers.
    advantages = from_numpy(samples_data['discount_adv'].astype(np.float32))
    n_traj = samples_data['obs'].shape[0]
    n_obs = n_traj * self.max_path_length
    if add_input_fn is not None:
        # Keep only the raw observation dims; the sampled add_input is
        # concatenated per minibatch below.
        obs = from_numpy(
            samples_data['obs'][:, :self.max_path_length, :self.obs_dim].astype(
                np.float32)).view(n_obs, -1)
    else:
        obs = from_numpy(
            samples_data['obs'][:, :self.max_path_length, :].astype(
                np.float32)).view(n_obs, -1)
    actions = samples_data['actions'].view(n_obs, -1).data
    returns = from_numpy(samples_data['discount_returns'].copy()).view(
        -1, 1).float()
    old_action_log_probs = samples_data['log_prob'].view(n_obs, -1).data
    states = samples_data['states'].view(
        samples_data['states'].size()[0], n_obs,
        -1) if self.policy.recurrent() else None
    for epoch_itr in range(self.epoch):
        sampler = BatchSampler(SubsetRandomSampler(range(n_obs)),
                               self.ppo_batch_size, drop_last=False)
        for indices in sampler:
            indices = LongTensor(indices)
            obs_batch = Variable(obs[indices])
            actions_batch = actions[indices]
            return_batch = returns[indices]
            old_action_log_probs_batch = old_action_log_probs[indices]
            if states is not None:
                self.policy.set_state(Variable(states[:, indices]))
            if add_input_fn is not None:
                add_input_dist = add_input_fn(Variable(add_input_input))
                add_input = add_input_dist.sample()
                add_input_rep = torch.unsqueeze(add_input, 1).repeat(
                    1, self.max_path_length, 1).view(n_obs, -1)
                add_input_batch = add_input_rep[indices]
                obs_batch = torch.cat([obs_batch, add_input_batch], -1)
            values = self.baseline.forward(obs_batch.detach())
            action_dist = self.policy.forward(obs_batch)
            action_log_probs = action_dist.log_likelihood(
                Variable(actions_batch)).unsqueeze(-1)
            dist_entropy = action_dist.entropy().mean()
            # Importance ratio between the current and the sampling policy.
            ratio = torch.exp(action_log_probs -
                              Variable(old_action_log_probs_batch))
            adv_targ = Variable(advantages.view(-1, 1)[indices])
            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                1.0 + self.clip_param) * adv_targ
            # PPO's pessimistic surrogate (L^CLIP).
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = (Variable(return_batch) - values).pow(2).mean()
            self.optimizer.zero_grad()
            total_loss = (value_loss + action_loss -
                          dist_entropy * self.entropy_bonus)
            if add_loss_fn is not None:
                total_loss += add_loss_fn(add_input_dist, add_input,
                                          add_input_input)
            total_loss.backward()
            self.optimizer.step()
        if print:
            stats = {
                'total loss': get_numpy(total_loss)[0],
                'action loss': get_numpy(action_loss)[0],
                'value loss': get_numpy(value_loss)[0],
                'entropy': get_numpy(dist_entropy)[0],
            }
            with logger.prefix('Train PPO itr %d epoch itr %d | ' %
                               (itr, epoch_itr)):
                self.print_diagnostics(stats)
    return total_loss
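# Standalone sketch of the PPO clipped surrogate computed above, on plain
# tensors (no policy or baseline objects). This is the pessimistic L^CLIP
# bound from the PPO paper; names are illustrative.
import torch

def ppo_clip_loss(log_probs, old_log_probs, advantages, clip_param=0.2):
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # Minimize the negative of the clipped objective.
    return -torch.min(surr1, surr2).mean()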
def f(flat_params):
    self.policy.set_params_flat(from_numpy(flat_params))
    return self.get_opt_output(samples_data, penalty)
def optimize_policy(self, itr, samples_data):
    try_penalty = float(
        np.clip(self._penalty, self._min_penalty, self._max_penalty))
    penalty_scale_factor = None

    def gen_f_opt(penalty):
        # Closure handed to L-BFGS: set the flat parameters, then return the
        # optimizer output for the given penalty.
        def f(flat_params):
            self.policy.set_params_flat(from_numpy(flat_params))
            return self.get_opt_output(samples_data, penalty)
        return f

    cur_params = get_numpy(self.policy.get_params_flat().double())
    opt_params = cur_params
    # Save flat views of the sample arrays once, for efficiency.
    samples_data['obs_flat_var'] = np_to_var(samples_data['obs_flat'])
    samples_data['action_dist_flat'] = samples_data['action_dist'].detach(
    ).reshape((-1, samples_data['action_dist'].dim))
    samples_data['actions_flat'] = samples_data['actions'].view(
        -1, self.action_dim)
    samples_data['discount_adv_var'] = np_to_var(samples_data['discount_adv'])
    for penalty_itr in range(self._max_penalty_itr):
        logger.log('trying penalty=%.3f...' % try_penalty)
        itr_opt_params, _, _ = scipy.optimize.fmin_l_bfgs_b(
            func=gen_f_opt(try_penalty),
            x0=cur_params,
            maxiter=self._max_opt_itr)
        _, try_loss, try_constraint_val = self.compute_loss_terms(
            samples_data, try_penalty)
        try_loss = get_numpy(try_loss)[0]
        try_constraint_val = get_numpy(try_constraint_val)[0]
        logger.log('penalty %f => loss %f, %s %f' %
                   (try_penalty, try_loss, self._constraint_name,
                    try_constraint_val))
        if try_constraint_val < self._max_constraint_val or \
                (penalty_itr == self._max_penalty_itr - 1 and opt_params is None):
            opt_params = itr_opt_params
        if not self._adapt_penalty:
            break
        # Decide the scale factor on the first iteration, or if constraint
        # violation yields a numerical error.
        if penalty_scale_factor is None or np.isnan(try_constraint_val):
            # Increase the penalty if the constraint is violated or NaN.
            if try_constraint_val > self._max_constraint_val or np.isnan(
                    try_constraint_val):
                penalty_scale_factor = self._increase_penalty_factor
            else:
                # Otherwise (constraint satisfied), shrink the penalty.
                penalty_scale_factor = self._decrease_penalty_factor
                opt_params = itr_opt_params
        else:
            if penalty_scale_factor > 1 and \
                    try_constraint_val <= self._max_constraint_val:
                break
            elif penalty_scale_factor < 1 and \
                    try_constraint_val >= self._max_constraint_val:
                break
        try_penalty *= penalty_scale_factor
        try_penalty = float(
            np.clip(try_penalty, self._min_penalty, self._max_penalty))
        self._penalty = try_penalty
    self.policy.set_params_flat(from_numpy(opt_params))
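# Note on the L-BFGS interface: when no separate fprime is passed,
# scipy.optimize.fmin_l_bfgs_b expects func(x) to return a (value, gradient)
# pair, which is presumably what get_opt_output provides above. Minimal
# quadratic example of that calling convention (illustrative):
import numpy as np
import scipy.optimize

def quad(x):
    return 0.5 * x.dot(x), x  # value and gradient of 0.5 * ||x||^2

def demo_lbfgs():
    x_opt, f_opt, info = scipy.optimize.fmin_l_bfgs_b(
        func=quad, x0=np.ones(5), maxiter=10)
    return x_opt  # converges to (approximately) the zero vector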
def train_explorer(self, dataset, test_dataset, dummy_dataset, itr):
    bs = self.batch_size
    # Load the fixed initial state and goals from the config.
    init_state = self.block_config[0]
    goals = np.array(self.block_config[1])
    # Functions for computing the reward and initializing the reward state
    # (rstate); rstate tracks things such as which goal you are currently on.
    reward_fn, init_rstate = self.reward_fn
    # Total actual reward collected by the MPC agent so far.
    total_mpc_rew = np.zeros(self.mpc_batch)
    # Keep track of states visited by MPC, to initialize the explorer from.
    all_inits = []
    # Current state of the MPC batch.
    cur_state = np.array([init_state] * self.mpc_batch)
    # Initialize the reward state for the MPC batch.
    rstate = init_rstate(self.mpc_batch)
    # For visualization purposes.
    mpc_preds = []
    mpc_actual = []
    mpc_span = []
    rstates = []
    # Perform MPC over max_horizon.
    for T in range(self.max_horizon):
        print(T)
        # For goal visualization.
        rstates.append(rstate)
        # Roll out imaginary trajectories using the state decoder.
        rollouts = self.mpc(cur_state,
                            min(self.plan_horizon, self.max_horizon - T),
                            self.mpc_explore, self.mpc_explore_batch,
                            reward_fn, rstate)
        # Get the first latent of the best trajectory for each batch.
        np_latents = rollouts[2][:, 0]
        # Roll out the first latent in the simulator.
        mpc_traj = self.sampler_mpc.obtain_samples(
            self.mpc_batch * self.max_path_length, self.max_path_length,
            np_to_var(np_latents), reset_args=cur_state)
        # Update the reward and reward state from the simulator trajectory.
        mpc_rew, rstate = self.eval_rewards(mpc_traj['obs'], reward_fn, rstate)
        # For logging and visualization purposes.
        futures = rollouts[0] + total_mpc_rew
        total_mpc_rew += mpc_rew
        mpc_preds.append(rollouts[1][0])
        mpc_span.append(rollouts[3])
        mpc_stats = {
            'mean futures': np.mean(futures),
            'std futures': np.std(futures),
            'mean actual': np.mean(total_mpc_rew),
            'std actual': np.std(total_mpc_rew),
        }
        mpc_actual.append(mpc_traj['obs'][0])
        with logger.prefix('itr #%d mpc step #%d | ' % (itr, T)):
            self.vae.print_diagnostics(mpc_stats)
            record_tabular(mpc_stats, 'mpc_stats.csv')
        # Add the current state to the list of explorer initialization states.
        all_inits.append(cur_state)
        # Update the current state to the simulator's current state.
        cur_state = mpc_traj['obs'][:, -1]
    # For visualization.
    for idx, (actual, pred, rs, span) in enumerate(
            zip(mpc_actual, mpc_preds, rstates, mpc_span)):
        dataset.plot_pd_compare(
            [actual, pred, span[:100], span[:100, :dataset.path_len]],
            ['actual', 'pred', 'imagined', 'singlestep'],
            itr, save_dir='mpc_match', name='Pred' + str(idx),
            goals=goals, goalidx=rs[0])
    # Compute the reward at the final state, for tasks that score it.
    final_reward, _ = reward_fn(cur_state, rstate)
    print(total_mpc_rew)
    print(final_reward)
    # Randomly select states for the explorer to explore from.
    start_states = np.concatenate(all_inits, axis=0)
    start_states = start_states[np.random.choice(
        start_states.shape[0], self.rand_per_mpc_step,
        replace=self.rand_per_mpc_step > start_states.shape[0])]
    # Run the explorer from those states.
    explore_len = ((self.max_path_length + 1) * self.mpc_explore_len) - 1
    self.policy_ex_algo.max_path_length = explore_len
    ex_trajs = self.sampler_ex.obtain_samples(
        start_states.shape[0] * explore_len, explore_len, None,
        reset_args=start_states)
    # Concat the actions taken by the explorer with the observations before
    # adding them to the dataset.
    trajs = ex_trajs['obs']
    obs = trajs[:, -1]
    if hasattr(self.action_space, 'shape') and len(self.action_space.shape) > 0:
        acts = get_numpy(ex_trajs['actions'])
    else:
        # Convert discrete actions into one-hot vectors.
        act_idx = get_numpy(ex_trajs['actions'])
        acts = np.zeros(
            (trajs.shape[0], trajs.shape[1] - 1, dataset.action_dim))
        acts_reshape = acts.reshape((-1, dataset.action_dim))
        acts_reshape[range(acts_reshape.shape[0]), act_idx.reshape(-1)] = 1.0
    # Concat actions with obs.
    acts = np.concatenate((acts, acts[:, -1:, :]), 1)
    trajacts = np.concatenate((ex_trajs['obs'], acts), axis=-1)
    trajacts = trajacts.reshape(
        (-1, self.max_path_length + 1, trajacts.shape[-1]))
    # Compute the train/val split.
    ntrain = min(int(0.9 * trajacts.shape[0]),
                 dataset.buffer_size // self.add_frac)
    if dataset.n < dataset.batch_size and ntrain < dataset.batch_size:
        ntrain = dataset.batch_size
    nvalid = min(trajacts.shape[0] - ntrain,
                 test_dataset.buffer_size // self.add_frac)
    if test_dataset.n < test_dataset.batch_size and nvalid < test_dataset.batch_size:
        nvalid = test_dataset.batch_size
    print("Adding ", ntrain, ", Valid: ", nvalid)
    dataset.add_samples(trajacts[:ntrain].reshape((ntrain, -1)))
    test_dataset.add_samples(trajacts[-nvalid:].reshape((nvalid, -1)))
    # The dummy dataset stores only data from this iteration.
    dummy_dataset.clear()
    dummy_dataset.add_samples(trajacts[:-nvalid].reshape(
        (trajacts.shape[0] - nvalid, -1)))
    # Compute the negative ELBO on the explorer's trajectories.
    neg_elbos = []
    cur_batch = from_numpy(trajacts).float()
    for i in range(0, trajacts.shape[0], self.batch_size):
        mse, neg_ll, kl, bcloss, z_dist = self.vae.forward_batch(
            cur_batch[i:i + self.batch_size])
        neg_elbo = (get_numpy(neg_ll) + get_numpy(kl))
        neg_elbos.append(neg_elbo)
    # Reward the explorer with the negative ELBO (novelty bonus).
    rewards = np.zeros_like(ex_trajs['rewards'])
    neg_elbos = np.concatenate(neg_elbos, axis=0)
    neg_elbos = neg_elbos.reshape((rewards.shape[0], -1))
    # Skip on the first iteration, since the VAE hasn't been fitted yet.
    if itr != 1:
        rewidx = list(
            range(self.max_path_length, explore_len,
                  self.max_path_length + 1)) + [explore_len - 1]
        for i in range(rewards.shape[0]):
            rewards[i, rewidx] = neg_elbos[i]
    # Add in the true reward to the explorer, if desired.
    if self.true_reward_scale != 0:
        rstate = init_rstate(rewards.shape[0])
        for oidx in range(rewards.shape[1]):
            r, rstate = reward_fn(ex_trajs['obs'][:, oidx], rstate)
            rewards[:, oidx] += r * self.true_reward_scale
    ex_trajs['rewards'] = rewards
    # Train the explorer with PPO on the negative-ELBO reward.
    self.policy_ex_algo.process_samples(0, ex_trajs)
    if itr != 1:
        self.policy_ex_algo.optimize_policy(0, ex_trajs)
    ex_trajs['stats']['MPC Actual'] = np.mean(total_mpc_rew)
    ex_trajs['stats']['Final Reward'] = np.mean(final_reward)
    # Reset the explorer if its policy entropy has collapsed.
    if ex_trajs['stats']['Entropy'] < self.reset_ent:
        if hasattr(self.policy_ex, "prob_network"):
            self.policy_ex.prob_network.apply(xavier_init)
        else:
            self.policy_ex.apply(xavier_init)
            self.policy_ex.log_var_network.params_var.data = \
                self.policy_ex.log_var_network.param_init
    # For visualization purposes.
    colors = ['purple', 'magenta', 'green', 'black', 'yellow', 'black']
    fig, ax = plt.subplots(3, 2, figsize=(10, 10))
    for i in range(6):
        if i * 2 + 1 < obs.shape[1]:
            axx = ax[i // 2][i % 2]
            if i == 5:
                axx.scatter(obs[:, -3], obs[:, -2], color=colors[i], s=10)
            else:
                axx.scatter(obs[:, i * 2], obs[:, i * 2 + 1],
                            color=colors[i], s=10)
            axx.set_xlim(-3, 3)
            axx.set_ylim(-3, 3)
    path = logger.get_snapshot_dir() + '/final_dist'
    if not os.path.exists(path):
        os.makedirs(path)
    plt.savefig('%s/%d.png' % (path, itr))
    np.save(path + "/" + str(itr), obs)
    return ex_trajs['stats']
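# Standalone sketch of the discrete-action one-hot conversion used above:
# act_idx holds integer actions of shape (n_traj, path_len); the result is
# (n_traj, path_len, action_dim) with a 1 at each chosen action.
import numpy as np

def one_hot_actions(act_idx, action_dim):
    n_traj, path_len = act_idx.shape
    acts = np.zeros((n_traj * path_len, action_dim))
    acts[np.arange(n_traj * path_len), act_idx.reshape(-1)] = 1.0
    return acts.reshape((n_traj, path_len, action_dim))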