def step(self):
    config = self.config
    storage = VPGStorageBuffer(config.rollout_length)
    states = self.states
    for _ in range(config.rollout_length):
        action_tr, log_prob_tr, entropy_tr, v_tr = self.network(
            config.state_normalizer(states))
        next_states, rewards, terminals, _ = self.task.step(
            toNumpy(action_tr))
        self.online_rewards += rewards
        rewards = config.reward_normalizer(rewards)
        for i, terminal in enumerate(terminals):
            if terminal:
                self.episode_rewards.append(self.online_rewards[i])
                self.online_rewards[i] = 0
        storage.store_next(states=toTensor(states),
                           actions=action_tr,
                           values=v_tr,
                           log_pi=log_prob_tr,
                           entropy=entropy_tr,
                           rewards=toTensor(rewards).unsqueeze(-1),
                           terminals=toTensor(1 - terminals).unsqueeze(-1))
        states = next_states
    self.states = states

    # bootstrap value for the state following the last rollout step
    action_tr, log_prob_tr, entropy_tr, v_tr = self.network(
        config.state_normalizer(states))
    storage.values.append(v_tr)

    advantages = toTensor(np.zeros((config.num_workers, 1)))
    returns = v_tr.detach()
    for i in reversed(range(config.rollout_length)):
        returns = storage.rewards[i] + config.discount * storage.terminals[i] * returns
        if not config.use_gae:
            advantages = returns - storage.values[i]
        else:
            td_error = storage.rewards[i] + config.discount * storage.terminals[i] * storage.values[i + 1] \
                - storage.values[i]
            advantages = storage.terminals[i] * config.gae_tau * config.discount * advantages + td_error
        storage.advantages[i] = advantages.detach()
        storage.returns[i] = returns.detach()

    states, actions, log_prob, values, returns, advantages, entropy = storage.cat(
        ['states', 'actions', 'log_pi', 'values', 'returns', 'advantages', 'entropy'])
    advantages = (advantages - advantages.mean()) / advantages.std()

    # log_prob must keep its graph so the policy gradient can flow through it
    policy_loss = -(log_prob * advantages).mean()
    value_loss = 0.5 * (returns - values).pow(2).mean()
    entropy_loss = entropy.mean()
    loss = policy_loss - config.entropy_weight * entropy_loss \
        + config.value_loss_weight * value_loss

    self.actor_optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(self.network.parameters(), config.gradient_clip)
    self.actor_optimizer.step()

    self.total_steps += config.rollout_length * config.num_workers

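
# Note: VPGStorageBuffer / PPOStorageBuffer are defined elsewhere in this codebase.
# The sketch below is only an assumption about the interface the step() methods rely
# on (store_next, per-key lists indexable by time step, and cat); it is not the
# actual implementation.
import torch


class RolloutStorageSketch:
    KEYS = ('states', 'actions', 'values', 'log_pi', 'entropy',
            'rewards', 'terminals', 'advantages', 'returns')

    def __init__(self, size):
        self.size = size
        for key in self.KEYS:
            # advantages / returns are pre-sized so they can be filled by index
            # during the backward pass over the rollout
            setattr(self, key,
                    [None] * size if key in ('advantages', 'returns') else [])

    def store_next(self, **kwargs):
        # append one time step worth of tensors, keyed by name
        for key, value in kwargs.items():
            getattr(self, key).append(value)

    def cat(self, keys):
        # stack each requested key over time: (size * num_workers, ...);
        # [:self.size] drops the extra bootstrap value appended after the rollout
        return [torch.cat(getattr(self, key)[:self.size], dim=0) for key in keys]
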
def eval_step(self, state):
    self.config.state_normalizer.set_read_only()
    state = self.config.state_normalizer(state)
    q = self.network(state)
    action = np.argmax(toNumpy(q))
    self.config.state_normalizer.unset_read_only()
    return action

def step(self):
    config = self.config
    actions = self.network(self.states)
    actions = toNumpy(actions)
    actions += self.random_process.sample()
    next_states, rewards, dones, _ = self.task.step(actions)
    next_states = self.config.state_normalizer(next_states)
    rewards = self.config.reward_normalizer(rewards)
    self.replay.store([
        self.states, actions, rewards, next_states,
        dones.astype(np.uint8)
    ])
    if dones[0]:
        self.random_process.reset_states()
    self.states = next_states
    self.total_steps += 1

    if self.replay.size >= config.min_memory_size:
        experiences = self.replay.sample(config.batch_size)
        states, actions, rewards, next_states, terminals = experiences
        states = states.squeeze(1)
        actions = actions.squeeze(1)
        rewards = toTensor(rewards)
        next_states = next_states.squeeze(1)
        terminals = toTensor(terminals)

        # critic update
        phi_next = self.target_network.feature(next_states)
        a_next = self.target_network.actor(phi_next)
        q_next = self.target_network.critic(phi_next, a_next)
        q_next = config.discount * q_next * (1 - terminals)
        q_next.add_(rewards)
        q_next = q_next.detach()
        phi = self.network.feature(states)
        q = self.network.critic(phi, toTensor(actions))
        critic_loss = (q - q_next).pow(2).mul(0.5).sum(-1).mean()

        self.network.zero_grad()
        critic_loss.backward()
        self.network.critic_opt.step()

        # actor update
        phi = self.network.feature(states)
        action = self.network.actor(phi)
        policy_loss = -self.network.critic(phi.detach(), action).mean()

        self.network.zero_grad()
        policy_loss.backward()
        self.network.actor_opt.step()

        # soft update of the target network
        for target_param, param in zip(self.target_network.parameters(),
                                       self.network.parameters()):
            target_param.detach_()
            target_param.copy_(target_param * (1.0 - self.config.target_network_mix) +
                               param * self.config.target_network_mix)

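
# self.random_process is configured elsewhere; for DDPG it is typically an
# Ornstein-Uhlenbeck process. A minimal sketch with the same
# sample()/reset_states() interface, under that assumption (the hyperparameters
# below are illustrative, not the repo's values):
import numpy as np


class OrnsteinUhlenbeckSketch:
    def __init__(self, size, std=0.2, theta=0.15, dt=1e-2):
        self.size = size
        self.std = std
        self.theta = theta
        self.dt = dt
        self.reset_states()

    def reset_states(self):
        # restart the process at zero, e.g. at episode boundaries
        self.x_prev = np.zeros(self.size)

    def sample(self):
        # dx = theta * (0 - x) * dt + std * sqrt(dt) * N(0, 1)
        x = self.x_prev + self.theta * (-self.x_prev) * self.dt \
            + self.std * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x_prev = x
        return x
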
def step(self):
    config = self.config
    # rollout
    for _ in range(self.config.rollout_length):
        # choose according to max(Q)
        q = self.network(config.state_normalizer(self.states)).mean(-1)
        epsilon = config.random_action_prob(config.num_workers)
        actions = epsilon_greedy(epsilon, toNumpy(q))
        next_states, rewards, dones, infos = self.task.step(actions)
        state, reward, next_state, done, info = self.states[0], rewards[0], \
            next_states[0], int(dones[0]), infos[0]
        self.states = next_states
        self.total_steps += 1
        reward = config.reward_normalizer(reward)
        self.replay.store([state, actions[0], reward, next_state, done])

    if self.total_steps > config.exploration_steps:
        # minibatch gradient descent
        experiences = self.replay.sample(config.batch_size)
        states, actions, rewards, next_states, terminals = experiences
        states = config.state_normalizer(states)
        next_states = config.state_normalizer(next_states)

        quantiles_next = self.target_network(next_states).detach()
        a_next = torch.argmax(quantiles_next.sum(-1), dim=-1)
        quantiles_next = quantiles_next[self.batch_indices, a_next, :]
        rewards = toTensor(rewards).unsqueeze(-1)
        terminals = toTensor(terminals).unsqueeze(-1)
        quantiles_next = rewards + self.config.discount * (1 - terminals) * quantiles_next

        quantiles = self.network(states)
        actions = toTensor(actions).long()
        quantiles = quantiles[self.batch_indices, actions, :]

        quantiles_next = quantiles_next.t().unsqueeze(-1)
        diff = quantiles_next - quantiles
        loss = huber_loss(diff) * (self.cumulative_density -
                                   (diff.detach() < 0).float()).abs()

        self.optimizer.zero_grad()
        loss.mean(0).mean(1).sum().backward()
        clip_grad_norm_(self.network.parameters(), self.config.gradient_clip)
        self.optimizer.step()

    if self.total_steps / config.rollout_length % config.target_network_update_freq == 0:
        self.target_network.load_state_dict(self.network.state_dict())

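
# huber_loss is a repo helper; the quantile-regression loss above only needs it
# to be an element-wise Huber penalty on the pairwise TD differences. A minimal
# sketch, assuming a threshold of k = 1:
import torch


def huber_loss_sketch(x, k=1.0):
    # quadratic inside [-k, k], linear outside; applied element-wise
    return torch.where(x.abs() < k,
                       0.5 * x.pow(2),
                       k * (x.abs() - 0.5 * k))
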
def __init__(self, config):
    super().__init__(config)
    self.network = config.network_fn()
    self.target_network = config.network_fn()
    self.target_network.load_state_dict(self.network.state_dict())
    self.optimizer = config.optimizer_fn(self.network.parameters())
    self.task = config.task_fn()
    self.states = config.state_normalizer(self.task.reset())
    self.q_options, self.betas, self.log_pi = self.network(self.states)
    self.options = epsilon_greedy(
        config.random_option_prob(config.num_workers),
        toNumpy(self.q_options))
    self.is_initial_betas = np.ones(self.config.num_workers)
    self.prev_options = np.copy(self.options)

def step(self):
    config = self.config
    # rollout
    for _ in range(self.config.rollout_length):
        # choose according to max(Q)
        q = self.network(config.state_normalizer(self.states))
        epsilon = config.random_action_prob(config.num_workers)
        actions = epsilon_greedy(epsilon, toNumpy(q))
        next_states, rewards, dones, infos = self.task.step(actions)
        rewards = config.reward_normalizer(rewards)
        self.replay.store([self.states[0], actions[0], rewards[0],
                           next_states[0], dones[0]])
        self.states = next_states
        self.total_steps += 1

    if self.total_steps > config.exploration_steps:
        # minibatch gradient descent
        experiences = self.replay.sample(config.batch_size)
        states, actions, rewards, next_states, terminals = experiences
        states = config.state_normalizer(states)
        next_states = config.state_normalizer(next_states)

        q_next = self.target_network(next_states).detach()
        if config.double_q:
            best_actions = torch.argmax(self.network(next_states), dim=-1)
            q_next = q_next[self.batch_indices, best_actions]
        else:
            q_next = q_next.max(1)[0]
        terminals = toTensor(terminals)
        rewards = toTensor(rewards)
        q_next = rewards + config.discount * q_next * (1 - terminals)

        actions = toTensor(actions).long()
        q = self.network(states)
        q = q[self.batch_indices, actions]
        loss = (q_next - q).pow(2).mul(0.5).mean()

        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.network.parameters(), config.gradient_clip)
        self.optimizer.step()

    if self.total_steps / config.rollout_length % config.target_network_update_freq == 0:
        self.target_network.load_state_dict(self.network.state_dict())

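
# epsilon_greedy is a repo helper; the sketch below only illustrates the behaviour
# the DQN rollout above relies on (random action with probability epsilon, greedy
# otherwise, vectorized over workers) and is an assumption, not the actual code.
import numpy as np


def epsilon_greedy_sketch(epsilon, q_values):
    # q_values: (num_workers, num_actions); epsilon: scalar or (num_workers,)
    greedy = np.argmax(q_values, axis=-1)
    random = np.random.randint(q_values.shape[-1], size=q_values.shape[0])
    explore = np.random.rand(q_values.shape[0]) < epsilon
    return np.where(explore, random, greedy)
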
def step(self):
    config = self.config
    rollout = []
    states = self.states
    for _ in range(self.config.rollout_length):
        # choose according to max(Q)
        q = self.network(config.state_normalizer(states))
        epsilon = config.random_action_prob(config.num_workers)
        actions = epsilon_greedy(epsilon, toNumpy(q))
        next_states, rewards, terminals, infos = self.task.step(actions)
        rewards = config.reward_normalizer(rewards)
        rollout.append([q, actions, rewards, 1 - terminals])
        states = next_states
        self.total_steps += config.num_workers
        if self.total_steps / config.num_workers % config.target_network_update_freq == 0:
            self.target_network.load_state_dict(self.network.state_dict())
    self.states = states

    # n-step returns, bootstrapped from the target network's max Q
    processed_rollout = [None] * len(rollout)
    returns = self.target_network(config.state_normalizer(states)).detach()
    returns, _ = torch.max(returns, dim=-1, keepdim=True)
    for i in reversed(range(len(rollout))):
        q, actions, rewards, terminals = rollout[i]
        actions = toTensor(actions).unsqueeze(1).long()
        q = q.gather(1, actions)
        terminals = toTensor(terminals).unsqueeze(1)
        rewards = toTensor(rewards).unsqueeze(1)
        returns = rewards + config.discount * terminals * returns
        processed_rollout[i] = [q, returns]

    q, returns = map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
    # loss = F.smooth_l1_loss(q, returns)
    # loss = huber_loss(q - returns)
    loss = 0.5 * (q - returns).pow(2).mean()
    self.optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(self.network.parameters(), config.gradient_clip)
    self.optimizer.step()

def step(self):
    config = self.config
    storage = PPOStorageBuffer(config.rollout_length)
    states = self.states
    for _ in range(config.rollout_length):
        action_tr, log_prob_tr, entropy_tr, v_tr = self.network(states)
        next_states, rewards, terminals, infos = self.task.step(
            toNumpy(action_tr))
        rewards = config.reward_normalizer(rewards)
        storage.store_next(states=toTensor(states),
                           actions=action_tr,
                           values=v_tr,
                           log_pi=log_prob_tr,
                           entropy=entropy_tr,
                           rewards=toTensor(rewards).unsqueeze(-1),
                           terminals=toTensor(1 - terminals).unsqueeze(-1))
        states = config.state_normalizer(next_states)
    self.states = states

    action_tr, log_prob_tr, entropy_tr, v_tr = self.network(states)
    storage.values.append(v_tr)

    advantages = toTensor(np.zeros((config.num_workers, 1)))
    returns = v_tr.detach()
    for i in reversed(range(config.rollout_length)):
        returns = storage.rewards[i] + config.discount * storage.terminals[i] * returns
        if not config.use_gae:
            advantages = returns - storage.values[i]
        else:
            td_error = storage.rewards[i] + config.discount * storage.terminals[i] * storage.values[i + 1] \
                - storage.values[i]
            advantages = storage.terminals[i] * config.gae_tau * config.discount * advantages + td_error
        storage.advantages[i] = advantages.detach()
        storage.returns[i] = returns.detach()

    states, actions, log_prob_old, returns, advantages = storage.cat(
        ['states', 'actions', 'log_pi', 'returns', 'advantages'])
    actions = actions.detach()
    log_prob_old = log_prob_old.detach()
    advantages = (advantages - advantages.mean()) / advantages.std()

    for _ in range(config.optimization_epochs):
        sampler = random_sample(np.arange(states.size(0)),
                                config.mini_batch_size)
        for batch_indices in sampler:
            batch_indices = toTensor(batch_indices).long()
            sampled_states = states[batch_indices]
            sampled_actions = actions[batch_indices]
            sampled_log_prob_old = log_prob_old[batch_indices]
            sampled_returns = returns[batch_indices]
            sampled_advantages = advantages[batch_indices]

            action_tr, log_prob_tr, entropy_tr, v_tr = self.network(
                sampled_states, sampled_actions)
            ratio = (log_prob_tr - sampled_log_prob_old).exp()
            obj = ratio * sampled_advantages
            obj_clipped = ratio.clamp(1.0 - config.ppo_ratio_clip,
                                      1.0 + config.ppo_ratio_clip) * sampled_advantages
            policy_loss = -torch.min(obj, obj_clipped).mean() \
                - config.entropy_weight * entropy_tr.mean()
            value_loss = 0.5 * (sampled_returns - v_tr).pow(2).mean()

            self.optimizer.zero_grad()
            (policy_loss + value_loss).backward()
            clip_grad_norm_(self.network.parameters(), config.gradient_clip)
            self.optimizer.step()

    self.total_steps += config.rollout_length * config.num_workers

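
# random_sample is a repo helper; the PPO epochs above only need it to yield
# shuffled mini-batches of indices. A minimal sketch under that assumption
# (here a trailing remainder smaller than batch_size is simply dropped):
import numpy as np


def random_sample_sketch(indices, batch_size):
    indices = np.random.permutation(indices)
    for i in range(0, len(indices) - batch_size + 1, batch_size):
        yield indices[i:i + batch_size]
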
def step(self):
    config = self.config
    # rollout
    for _ in range(self.config.rollout_length):
        # choose according to max(Q)
        probs, _ = self.network(config.state_normalizer(self.states))
        q = (probs * self.atoms).sum(-1)
        epsilon = config.random_action_prob(config.num_workers)
        actions = epsilon_greedy(epsilon, toNumpy(q))
        next_states, rewards, dones, infos = self.task.step(actions)
        state, reward, next_state, done, info = self.states[0], rewards[0], \
            next_states[0], int(dones[0]), infos[0]
        self.states = next_states
        self.total_steps += 1
        reward = config.reward_normalizer(reward)
        self.replay.store([state, actions[0], reward, next_state, done])

    if self.total_steps > config.exploration_steps:
        # minibatch gradient descent
        experiences = self.replay.sample(config.batch_size)
        states, actions, rewards, next_states, terminals = experiences
        states = config.state_normalizer(states)
        next_states = config.state_normalizer(next_states)

        prob_next, _ = self.target_network(next_states)
        prob_next = prob_next.detach()
        q_next = (prob_next * self.atoms).sum(-1)
        a_next = torch.argmax(q_next, dim=-1)
        prob_next = prob_next[self.batch_indices, a_next, :]

        rewards = toTensor(rewards).unsqueeze(-1)
        terminals = toTensor(terminals).unsqueeze(-1)
        atoms_next = rewards + self.config.discount * (1 - terminals) * self.atoms.view(1, -1)
        atoms_next.clamp_(self.config.categorical_v_min,
                          self.config.categorical_v_max)
        b = (atoms_next - self.config.categorical_v_min) / self.delta_atom
        l = b.floor()
        u = b.ceil()
        d_m_l = (u + (l == u).float() - b) * prob_next
        d_m_u = (b - l) * prob_next
        target_prob = toTensor(np.zeros(prob_next.size()))
        for i in range(target_prob.size(0)):
            target_prob[i].index_add_(0, l[i].long(), d_m_l[i])
            target_prob[i].index_add_(0, u[i].long(), d_m_u[i])

        _, log_prob = self.network(states)
        actions = toTensor(actions).long()
        log_prob = log_prob[self.batch_indices, actions, :]
        loss = -(target_prob * log_prob).sum(-1).mean()

        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.network.parameters(), self.config.gradient_clip)
        self.optimizer.step()

    if self.total_steps / config.rollout_length % config.target_network_update_freq == 0:
        self.target_network.load_state_dict(self.network.state_dict())