def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    # evaluation_env = deep_clone(env) if self._eval_n_episodes else None
    # if self.high_lv_control:
    #     evaluation_env = env
    # else:
    evaluation_env = deep_clone(env) if self._eval_n_episodes else None
    # TODO: use Ezpickle to deep_clone???

    with tf.get_default_session().as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            # Added to render
            # if self._eval_render:
            #     from schema.utils.sampler_utils import rollout
            #     rollout(self.env, self.policy, max_path_length=1000, animated=True)

        self.sampler.terminate()
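# `deep_clone` is imported from elsewhere in the repo and is not shown here.
# Based on how it is used above (producing an independent copy of `env` for
# evaluation) and on the TODO about Ezpickle, a minimal pickle-based stand-in
# could look like the sketch below; the real helper may differ.
import pickle

def deep_clone(env):
    """Return an independent copy of `env` via a pickle round-trip (sketch).

    Assumes the environment object is picklable.
    """
    return pickle.loads(pickle.dumps(env))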
def _evaluate(self, policy, evaluation_env):
    """Perform evaluation for the current policy."""
    if self._eval_n_episodes < 1:
        return

    # TODO: max_path_length should be a property of environment.
    input = None if self.high_lv_control else self._action_dim
    paths = rollouts(evaluation_env, policy,
                     self.sampler._max_path_length,
                     self._eval_n_episodes, input)

    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(p['rewards']) for p in paths]

    logger.record_tabular('return-average', np.mean(total_returns))
    logger.record_tabular('return-min', np.min(total_returns))
    logger.record_tabular('return-max', np.max(total_returns))
    logger.record_tabular('return-std', np.std(total_returns))
    logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
    logger.record_tabular('episode-length-min', np.min(episode_lengths))
    logger.record_tabular('episode-length-max', np.max(episode_lengths))
    logger.record_tabular('episode-length-std', np.std(episode_lengths))

    # TODO: figure out how to pass log_diagnostics through
    evaluation_env.log_diagnostics(paths)
    if self._eval_render:
        evaluation_env.render(paths)

    if self.sampler.batch_ready():
        batch = self.sampler.random_batch()
        self.log_diagnostics(batch)
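# `rollouts` is likewise imported from elsewhere. Judging only from the call
# above (env, policy, max path length, number of episodes, an optional extra
# policy input) and from how the result is consumed (a list of dicts holding a
# 'rewards' array), a minimal stand-in might look like this sketch; the
# rllab-style `policy.get_action` interface is an assumption.
import numpy as np

def rollouts(env, policy, path_length, n_paths, policy_input=None):
    paths = []
    for _ in range(n_paths):
        observation = env.reset()
        rewards = []
        for _ in range(path_length):
            action, _ = policy.get_action(observation)  # assumed interface
            observation, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        paths.append({'rewards': np.array(rewards)})
    return paths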
def train(self, envs):
    self.training_step = 0
    best_reward = 0
    visited_rooms = set()
    eplen = 0
    rollout_idx = 0
    state = np.transpose(envs.reset(), (0, 3, 1, 2))

    # rollout
    while rollout_idx < self.num_rollouts:
        states = np.zeros((self.num_steps, self.num_envs, 1, 84, 84), np.float32)
        actions = np.zeros((self.num_steps, self.num_envs), np.int32)
        action_log_probs = np.zeros((self.num_steps, self.num_envs), np.float32)
        rewards = np.zeros((self.num_steps, self.num_envs), np.float32)
        next_states = np.zeros((self.num_steps, self.num_envs, 1, 84, 84), np.float32)
        dones = np.zeros((self.num_steps, self.num_envs), np.int32)
        current_best_reward = 0
        hidden = None

        for t in range(self.num_steps):
            action, action_log_prob, hidden = self.select_action(state, hidden)
            next_state, reward, done, info = envs.step(action)
            # TensorFlow format to PyTorch
            next_state = np.transpose(next_state, (0, 3, 1, 2))

            # transitions
            states[t, ...] = state
            actions[t, ...] = action
            action_log_probs[t, ...] = action_log_prob
            rewards[t, ...] = reward
            next_states[t, ...] = next_state
            dones[t, ...] = done

            if self.render:
                envs.render(0)
            state = next_state

            # done
            for i, dne in enumerate(done):
                if dne:
                    epinfo = info[i]['episode']
                    if 'visited_rooms' in epinfo:
                        visited_rooms |= epinfo['visited_rooms']
                    best_reward = max(epinfo['r'], best_reward)
                    current_best_reward = max(epinfo['r'], current_best_reward)
                    eplen += epinfo['l']

        # logger
        logger.info('GAME STATUS')
        logger.record_tabular('rollout_idx', rollout_idx)
        logger.record_tabular('visited_rooms',
                              str(len(visited_rooms)) + ', ' + str(visited_rooms))
        logger.record_tabular('best_reward', best_reward)
        logger.record_tabular('current_best_reward', current_best_reward)
        logger.record_tabular('eplen', eplen)
        logger.dump_tabular()

        # train neural networks
        self.update_parameters(states, actions, action_log_probs,
                               rewards, next_states, dones)
        rollout_idx += 1
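# `select_action` is defined elsewhere in this agent. Given that the update
# step below unpacks `self.actor_critic(state)` into (dist, value, hidden) and
# that the rollout stores integer actions with their log-probabilities, a
# plausible sketch is shown here; the recurrent-hidden-state handling and the
# categorical policy head are assumptions.
import torch

def select_action(self, state, hidden):
    with torch.no_grad():
        state = torch.from_numpy(state).to(dtype=torch.float32,
                                           device=self.device)
        dist, _, hidden = self.actor_critic(state)  # categorical policy assumed
        action = dist.sample()
        action_log_prob = dist.log_prob(action)
    return action.cpu().numpy(), action_log_prob.cpu().numpy(), hidden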
def update_parameters(self, states, actions, action_log_probs, rewards,
                      next_states, dones):
    # T * B * features
    states = torch.from_numpy(states).to(dtype=torch.float32, device=self.device)
    actions = torch.from_numpy(actions).to(dtype=torch.int32, device=self.device)
    old_action_log_probs = torch.from_numpy(action_log_probs).to(
        dtype=torch.float32, device=self.device)
    rewards = torch.from_numpy(rewards).to(dtype=torch.float32, device=self.device)
    next_states = torch.from_numpy(next_states).to(dtype=torch.float32,
                                                   device=self.device)
    masks = 1 - torch.from_numpy(dones).to(dtype=torch.float32, device=self.device)

    # GENERALIZED ADVANTAGE ESTIMATION
    with torch.no_grad():
        advantages = torch.zeros_like(rewards)
        _, values, _ = self.actor_critic(
            torch.cat([states, next_states[-1].unsqueeze(0)], dim=0))
        values = values.squeeze(2)  # remove last dimension
        last_gae_lam = 0
        for t in range(self.num_steps - 1, -1, -1):
            delta = rewards[t] + masks[t] * self.gamma * values[t + 1] - values[t]
            advantages[t, :] = delta + masks[t] * self.lamda * self.gamma * last_gae_lam
            last_gae_lam = advantages[t]
        returns = advantages + values[:-1]

    logger.info('GENERALIZED ADVANTAGE ESTIMATION')
    logger.record_tabular('advantages mean', advantages.mean(dim=(0, 1)))
    logger.record_tabular('advantages std', advantages.std(dim=(0, 1)))
    logger.record_tabular('returns mean', returns.mean(dim=(0, 1)))
    logger.record_tabular('returns std', returns.std(dim=(0, 1)))
    logger.dump_tabular()

    # train epochs
    for epoch_idx in range(self.update_epochs):
        self.training_step += 1

        # sample (T * B * features)
        slic = random.sample(list(range(self.num_envs)), self.sample_envs)
        state = states[:, slic, ...].contiguous()
        action = actions[:, slic, ...]
        old_action_log_prob = old_action_log_probs[:, slic, ...]
        retur = returns[:, slic, ...]
        advantage = advantages[:, slic, ...]

        # policy loss
        dist, value, _ = self.actor_critic(state)
        action_log_prob = dist.log_prob(action)
        ratio = torch.exp(action_log_prob - old_action_log_prob)
        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1.0 - self.clip_range,
                            1.0 + self.clip_range) * advantage
        action_loss = -torch.mean(torch.min(surr1, surr2), dim=(0, 1))

        # value loss
        smooth_l1_loss = nn.SmoothL1Loss(reduction='mean')
        value_loss = smooth_l1_loss(retur.flatten(), value.flatten())

        # entropy loss
        entropy_loss = -torch.mean(dist.entropy(), dim=(0, 1))

        # backprop
        loss = action_loss + value_loss + self.coeff_ent * entropy_loss
        self.optimizer.zero_grad()
        loss.backward()
        if self.max_grad_norm > 1e-8:
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.max_grad_norm)
        self.optimizer.step()

        if self.training_step % 10000 == 0:
            self.save_param(self.saved_path)

        logger.info('UPDATE')
        logger.record_tabular('training_step', self.training_step)
        logger.record_tabular('value_loss', value_loss.item())
        logger.record_tabular('policy_loss', action_loss.item())
        logger.record_tabular('entropy_loss', entropy_loss.item())
        logger.dump_tabular()
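# The backward recursion above is standard GAE(lambda). The same computation in
# plain NumPy, included only as a shape reference (T steps x B envs, with
# `values` holding T + 1 rows including the bootstrap value), looks like this:
import numpy as np

def gae_numpy(rewards, values, masks, gamma, lam):
    """rewards, masks: (T, B); values: (T + 1, B). Returns (advantages, returns)."""
    T = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    last_gae_lam = 0.0
    for t in range(T - 1, -1, -1):
        delta = rewards[t] + masks[t] * gamma * values[t + 1] - values[t]
        advantages[t] = delta + masks[t] * gamma * lam * last_gae_lam
        last_gae_lam = advantages[t]
    returns = advantages + values[:-1]
    return advantages, returns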
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')

    X_v, vtarg_n_v, loss2, loss_sampled2 = vf.update_info
    optim2 = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                                clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                                async=0, kfac_update=2, cold_iter=50,
                                weight_decay_dict=vf.wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op2 = optim2.minimize(loss2, loss_sampled2, var_list=vf_var_list)

    ob_p, oldac_p, adv_p, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99,
                               async=0, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)
    update_op = optim.minimize(loss, loss_sampled, var_list=pi_var_list)

    sess = tf.get_default_session()
    sess.run(tf.variables_initializer(set(tf.global_variables())))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        paths_ = []
        for p in paths:
            l = pathlength(p)
            act = p["action_dist"].astype('float32')
            paths_.append(
                np.concatenate([p['observation'], act, np.ones((l, 1))], axis=1))
        X1 = np.concatenate(paths_)
        y = np.concatenate(vtargs)
        logger.record_tabular("EVBefore", explained_variance(vf._predict(X1), y))
        # for _ in range(20):
        #     sess.run(update_op2, {X_v: X1, vtarg_n_v: y})  # do_update2(X, y)
        logger.record_tabular("EVAfter", explained_variance(vf._predict(X1), y))

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        sess.run(update_op, {
            ob_p: ob_no,
            oldac_p: action_na,
            adv_p: standardized_adv_n
        })

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths))
                                      for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1
def train(self, env):
    # Memory
    memory = ReplayBuffer(capacity=self.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            if total_numsteps < self.start_steps:
                action = env.action_space.sample()  # Sample random action
            else:
                # Sample action from policy
                action = self.select_action(state)

            if len(memory) > self.batch_size:
                # Number of updates per step in environment
                for i in range(self.updates_per_step):
                    # Update parameters of all the networks
                    q1_loss, q2_loss, policy_loss, alpha_loss = self.update_parameters(
                        memory, self.batch_size, updates)
                    updates += 1

            next_state, reward, done, _ = env.step(action)  # Step
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward
            if self.render:
                env.render()

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            done = 0 if episode_steps == env._max_episode_steps else done

            memory.push(state, action, reward, next_state, done)  # Append transition to memory

            state = next_state

        # Loss values exist only once at least one update has run.
        if updates > 0:
            logger.info('UPDATE')
            logger.record_tabular('q1_loss', q1_loss)
            logger.record_tabular('q2_loss', q2_loss)
            logger.record_tabular('policy_loss', policy_loss)
            logger.record_tabular('alpha_loss', alpha_loss)
            logger.dump_tabular()

        logger.info('STATUS')
        logger.record_tabular('i_episode', i_episode)
        logger.record_tabular('episode_steps', episode_steps)
        logger.record_tabular('total_numsteps', total_numsteps)
        logger.record_tabular('episode_reward', episode_reward)
        logger.dump_tabular()

        if i_episode % 100 == 0:
            logger.info('SAVE')
            self.save_model('../saved/sac')

        if total_numsteps > self.num_steps:
            return
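# `ReplayBuffer` is imported from elsewhere. Judging from its use above
# (constructed with `capacity=`, filled with `push(...)`, queried with `len()`,
# and sampled inside `update_parameters`), a minimal ring-buffer sketch is:
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        # Overwrite the oldest transition once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)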
def train(self, envs):
    self.training_step = 0
    best_reward = torch.zeros((1,), device=self.device)
    eplen = torch.zeros((1,), device=self.device, dtype=torch.int32)
    visited_rooms = set()
    rollout_idx = 0
    state = np.transpose(envs.reset(), (0, 3, 1, 2))

    # rollout
    while rollout_idx < self.num_rollouts:
        # sync model
        distributed_util.sync_model(self.actor_critic)

        states = np.zeros((self.num_steps, self.num_envs, 1, 84, 84), np.float32)
        actions = np.zeros((self.num_steps, self.num_envs), np.int32)
        action_log_probs = np.zeros((self.num_steps, self.num_envs), np.float32)
        rewards = np.zeros((self.num_steps, self.num_envs), np.float32)
        next_states = np.zeros((self.num_steps, self.num_envs, 1, 84, 84), np.float32)
        dones = np.zeros((self.num_steps, self.num_envs), np.int32)
        current_best_reward = torch.zeros((1,), device=self.device)
        hidden = None

        for t in range(self.num_steps):
            action, action_log_prob, hidden = self.select_action(state, hidden)
            next_state, reward, done, info = envs.step(action)
            # TensorFlow format to PyTorch
            next_state = np.transpose(next_state, (0, 3, 1, 2))

            # transitions
            states[t, ...] = state
            actions[t, ...] = action
            action_log_probs[t, ...] = action_log_prob
            rewards[t, ...] = reward
            next_states[t, ...] = next_state
            dones[t, ...] = done

            if self.render:
                envs.render(0)
            state = next_state

            # done
            for i, dne in enumerate(done):
                if dne:
                    epinfo = info[i]['episode']
                    if 'visited_rooms' in epinfo:
                        visited_rooms |= epinfo['visited_rooms']
                    best_reward[0] = max(epinfo['r'], best_reward[0])
                    current_best_reward[0] = max(epinfo['r'], current_best_reward[0])
                    eplen[0] += epinfo['l']

        # logger
        dist.all_reduce(best_reward, op=dist.ReduceOp.MAX)
        dist.all_reduce(current_best_reward, op=dist.ReduceOp.MAX)
        # TODO: sync visited_rooms
        if self.rank == 0:
            logger.info('GAME STATUS')
            logger.record_tabular('rollout_idx', rollout_idx)
            logger.record_tabular('visited_rooms',
                                  str(len(visited_rooms)) + ', ' + str(visited_rooms))
            logger.record_tabular('best_reward', best_reward.item())
            logger.record_tabular('current_best_reward', current_best_reward.item())
            logger.record_tabular('eplen', eplen.item() * dist.get_world_size())
            logger.dump_tabular()

        # train neural networks
        self.update_parameters(states, actions, action_log_probs,
                               rewards, next_states, dones)
        rollout_idx += 1
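# `distributed_util.sync_model` is not shown in this listing. A common way to
# keep the replicas identical before each rollout is to broadcast the rank-0
# parameters to every worker (sketch, assuming torch.distributed has already
# been initialized):
import torch.distributed as dist

def sync_model(model, src_rank=0):
    """Broadcast `model`'s parameters from `src_rank` to all workers."""
    for param in model.parameters():
        dist.broadcast(param.data, src=src_rank)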