def unpack_ppo_batch(self, batch):
    """
    batch.state: tuple of num_episodes of FloatTensor (1, t, state_dim), where t is variable
    batch.action: tuple of num_episodes of tuples of (LongTensor (1), LongTensor (1)) for (action, index)
    batch.reward: tuple of num_episodes scalars in {0, 1}
    batch.mask: tuple of num_episodes scalars in {0, 1}
    batch.value: tuple of num_episodes of Variable FloatTensor (1, 1)
    batch.logprob: tuple of num_episodes of tuples of (FloatTensor (1), FloatTensor (1)) for (action_logprob, index_logprob)

    Returns (none of which are Variables):
        states: tuple of length B of FloatTensor (1, t, state_dim)
        actions: tuple of length B of LongTensor (1)
        secondary_actions: tuple of length B of LongTensor (1)
        action_logprobs: FloatTensor (B)
        secondary_log_probs: FloatTensor (B)
        values: FloatTensor (B, 1)
        rewards: FloatTensor (B)
        masks: FloatTensor (B)
        perm_idx: tuple of indices that sort episodes by decreasing length
        group_idx: array grouping positions of equal-length episodes
    """
    lengths = [e.size(1) for e in batch.state]
    perm_idx, sorted_lengths = u.sort_decr(lengths)
    group_idx, group_lengths = u.group_by_element(sorted_lengths)

    states = batch.state  # tuple of num_episodes of FloatTensor (1, t, state_dim)
    actions, secondary_actions = zip(*batch.action)
    action_logprobs, secondary_log_probs = zip(*batch.logprob)

    action_logprobs = torch.cat(action_logprobs).data  # FloatTensor (B)
    secondary_log_probs = torch.cat(secondary_log_probs).data  # FloatTensor (B)
    values = torch.cat(batch.value).data  # FloatTensor (B, 1)
    rewards = u.cuda_if_needed(torch.from_numpy(np.stack(batch.reward)).float(), self.args)  # FloatTensor (B)
    masks = u.cuda_if_needed(torch.from_numpy(np.stack(batch.mask)).float(), self.args)  # FloatTensor (B)

    return (states, actions, secondary_actions, action_logprobs, secondary_log_probs,
            values, rewards, masks, perm_idx, group_idx)
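# u.sort_decr and u.group_by_element are project utilities; these plain-Python
# stand-ins are a sketch of the assumed contract (not the originals), showing
# the sort-by-length-then-group-equal-lengths pattern unpack_ppo_batch relies on.
def sort_decr(lengths):
    # indices that sort `lengths` in decreasing order, plus the sorted lengths
    perm_idx = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)
    sorted_lengths = [lengths[i] for i in perm_idx]
    return tuple(perm_idx), sorted_lengths

def group_by_element(sorted_lengths):
    # group positions of equal, already-sorted lengths so that episodes of the
    # same length can later be stacked into one dense batch
    group_idx, group_lengths = [], []
    start = 0
    for i in range(1, len(sorted_lengths) + 1):
        if i == len(sorted_lengths) or sorted_lengths[i] != sorted_lengths[start]:
            group_idx.append(list(range(start, i)))
            group_lengths.append(sorted_lengths[start])
            start = i
    return group_idx, group_lengths

# e.g. lengths [3, 5, 3, 2] -> perm (1, 0, 2, 3); groups [[0], [1, 2], [3]]
# with group_lengths [5, 3, 2].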
def get_log_prob(self, state, action, secondary_action):
    b, t, d = state.size()
    action_dist, rnn_out, summarized_rnn_out = self.forward(state)
    action_log_prob = logprob_categorical_dist(action_dist, action)
    if action.data[0] == 2:  # STOP
        # dummy one-point distribution; the secondary action is vacuous here
        stop_dist = cuda_if_needed(Variable(torch.ones(1)), self.args)
        secondary_log_prob = logprob_categorical_dist(stop_dist, secondary_action)
    elif action.data[0] == 1:  # REDUCE
        indices = self.get_indices(state.data)
        if indices.sum() > 0:
            reduction_scores = self.get_reducer_dist(rnn_out)
            reduction_dist = F.softmax(reduction_scores, dim=-1)
        else:
            # no reducible positions: fall back to a one-point distribution
            reduction_dist = cuda_if_needed(
                Variable(torch.ones(b, 1), volatile=action.volatile), self.args)
        secondary_log_prob = logprob_categorical_dist(reduction_dist, secondary_action)
    elif action.data[0] == 0:  # TRANSLATE
        translator_scores = self.get_translator_dist(summarized_rnn_out)
        translator_dist = F.softmax(translator_scores, dim=-1)
        secondary_log_prob = logprob_categorical_dist(translator_dist, secondary_action)
    else:
        assert False, 'unknown action type: {}'.format(action.data[0])
    return action_log_prob, secondary_log_prob
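# logprob_categorical_dist is a project helper imported elsewhere; its assumed
# contract is "log-probability of each chosen index under the given categorical
# weights". A hypothetical stand-in, reshaped so it also tolerates the
# one-point dummy distributions above in the single-sample case:
import torch

def logprob_categorical_dist(dist, choice):
    # dist: categorical weights, reshapeable to (b, k); choice: LongTensor (b)
    probs = dist.view(choice.size(0), -1)
    probs = probs / probs.sum(-1, keepdim=True)  # normalize rows
    return torch.log(probs.gather(1, choice.view(-1, 1))).view(-1)  # (b)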
def ppo_step(self, num_value_iters, states, actions, indices, returns, advantages,
             fixed_action_logprobs, fixed_index_logprobs, lr_mult, lr, clip_epsilon, l2_reg):
    clip_epsilon = clip_epsilon * lr_mult

    """update critic"""
    values_target = Variable(u.cuda_if_needed(returns, self.args))  # (mb, 1)
    for k in range(num_value_iters):
        values_pred = self.valuefn(Variable(states))  # (mb, 1)
        value_loss = (values_pred - values_target).pow(2).mean()
        # weight decay
        for param in self.valuefn.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

    """update policy"""
    advantages_var = Variable(u.cuda_if_needed(advantages, self.args)).view(-1)  # (mb)

    # recompute log-probs under the current policy, batching episodes by action type
    perm_idx, sorted_actions = u.sort_decr(actions)
    inverse_perm_idx = u.invert_permutation(perm_idx)
    group_idx, group_actions = u.group_by_element(sorted_actions)

    # permute everything by action type
    states_ap, actions_ap, indices_ap = map(lambda x: u.permute(x, perm_idx),
                                            [states, actions, indices])
    # group everything by action type
    states_ag, actions_ag, indices_ag = map(lambda x: u.group_by_indices(x, group_idx),
                                            [states_ap, actions_ap, indices_ap])

    action_logprobs, index_logprobs = [], []
    for grp in range(len(group_idx)):
        states_grp = torch.stack(states_ag[grp])  # (g, grp_length, indim)
        actions_grp = torch.LongTensor(np.stack(actions_ag[grp]))  # (g)
        indices_grp = torch.LongTensor(np.stack(indices_ag[grp]))  # (g)

        actions_grp = u.cuda_if_needed(actions_grp, self.args)
        indices_grp = u.cuda_if_needed(indices_grp, self.args)

        alp, ilp = self.policy.get_log_prob(Variable(states_grp), Variable(actions_grp),
                                            Variable(indices_grp))
        action_logprobs.append(alp)
        index_logprobs.append(ilp)

    action_logprobs = torch.cat(action_logprobs)
    index_logprobs = torch.cat(index_logprobs)

    # unpermute
    inverse_perm_idx = u.cuda_if_needed(torch.LongTensor(inverse_perm_idx), self.args)
    action_logprobs = action_logprobs[inverse_perm_idx]
    index_logprobs = index_logprobs[inverse_perm_idx]

    ratio = torch.exp(action_logprobs + index_logprobs
                      - Variable(fixed_action_logprobs) - Variable(fixed_index_logprobs))
    surr1 = ratio * advantages_var  # (mb)
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages_var  # (mb)
    policy_surr = -torch.min(surr1, surr2).mean()

    self.policy_optimizer.zero_grad()
    policy_surr.backward()
    torch.nn.utils.clip_grad_norm(self.policy.parameters(), 40)
    self.policy_optimizer.step()
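# A small self-contained check (not part of the training loop) of the clipped
# surrogate above: once the ratio drifts outside [1 - eps, 1 + eps], the
# clipped branch caps the objective, so the update gets no extra incentive to
# push the ratio further.
import torch

def clipped_surrogate(log_probs, fixed_log_probs, advantages, clip_epsilon=0.2):
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    return -torch.min(surr1, surr2).mean()

# e.g. ratio = e^0.5 ~ 1.65 with advantage +1 is clamped to 1.2:
# clipped_surrogate(torch.FloatTensor([0.5]), torch.FloatTensor([0.0]),
#                   torch.FloatTensor([1.0]))  -> -1.2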
def improve_policy_ppo(self):
    optim_epochs = self.args.ppo_optim_epochs  # can anneal this
    minibatch_size = self.args.ppo_minibatch_size
    num_value_iters = self.args.ppo_value_iters
    clip_epsilon = self.args.ppo_clip
    gamma = self.args.gamma
    tau = 0.95
    l2_reg = 1e-3

    batch = self.replay_buffer.sample()
    (all_states, all_actions, all_indices, all_fixed_action_logprobs,
     all_fixed_index_logprobs, all_values, all_rewards, all_masks,
     perm_idx, group_idx) = self.unpack_ppo_batch(batch)
    all_advantages, all_returns = self.estimate_advantages(
        all_rewards, all_masks, all_values, gamma, tau)  # (b, 1) (b, 1)

    # permute everything by length
    states_p, actions_p, indices_p, returns_p, advantages_p, fixed_action_logprobs_p, fixed_index_logprobs_p = map(
        lambda x: u.permute(x, perm_idx),
        [all_states, all_actions, all_indices, all_returns, all_advantages,
         all_fixed_action_logprobs, all_fixed_index_logprobs])

    # group everything by length
    states_g, actions_g, indices_g, returns_g, advantages_g, fixed_action_logprobs_g, fixed_index_logprobs_g = map(
        lambda x: u.group_by_indices(x, group_idx),
        [states_p, actions_p, indices_p, returns_p, advantages_p,
         fixed_action_logprobs_p, fixed_index_logprobs_p])

    for j in range(optim_epochs):
        for grp in range(len(group_idx)):
            states = torch.cat(states_g[grp], dim=0)  # FloatTensor (g, grp_length, indim)
            actions = torch.cat(actions_g[grp])  # LongTensor (g)
            indices = torch.cat(indices_g[grp])  # LongTensor (g)
            returns = torch.cat(returns_g[grp])  # FloatTensor (g)
            advantages = torch.cat(advantages_g[grp])  # FloatTensor (g)
            fixed_action_logprobs = u.cuda_if_needed(
                torch.FloatTensor(fixed_action_logprobs_g[grp]), self.args)  # FloatTensor (g)
            fixed_index_logprobs = u.cuda_if_needed(
                torch.FloatTensor(fixed_index_logprobs_g[grp]), self.args)  # FloatTensor (g)

            for x in [states, actions, indices, returns, advantages,
                      fixed_action_logprobs, fixed_index_logprobs]:
                assert not isinstance(x, torch.autograd.Variable)

            # shuffle within the group
            perm = np.random.permutation(states.shape[0])
            perm = u.cuda_if_needed(torch.LongTensor(perm), self.args)
            states, actions, indices, returns, advantages, fixed_action_logprobs, fixed_index_logprobs = \
                states[perm], actions[perm], indices[perm], returns[perm], advantages[perm], \
                fixed_action_logprobs[perm], fixed_index_logprobs[perm]

            optim_iter_num = int(np.ceil(states.shape[0] / float(minibatch_size)))
            for i in range(optim_iter_num):
                ind = slice(i * minibatch_size, min((i + 1) * minibatch_size, states.shape[0]))
                states_b, actions_b, indices_b, advantages_b, returns_b, fixed_action_logprobs_b, fixed_index_logprobs_b = \
                    states[ind], actions[ind], indices[ind], advantages[ind], returns[ind], \
                    fixed_action_logprobs[ind], fixed_index_logprobs[ind]
                self.ppo_step(num_value_iters, states_b, actions_b, indices_b, returns_b,
                              advantages_b, fixed_action_logprobs_b, fixed_index_logprobs_b,
                              1, self.args.plr, clip_epsilon, l2_reg)
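# estimate_advantages is called above but defined elsewhere; a standard GAE
# implementation matching its signature (a sketch under that assumption, not
# necessarily the project's exact code) looks like this:
import torch

def estimate_advantages(rewards, masks, values, gamma, tau):
    # rewards, masks: FloatTensor (b); values: FloatTensor (b, 1)
    b = rewards.size(0)
    advantages = torch.zeros(b, 1)
    prev_value, prev_advantage = 0.0, 0.0
    for i in reversed(range(b)):
        # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i, 0]
        # A_t = delta_t + gamma * tau * A_{t+1} * mask_t
        advantages[i, 0] = delta + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i, 0]
        prev_advantage = advantages[i, 0]
    returns = values + advantages  # (b, 1)
    advantages = (advantages - advantages.mean()) / advantages.std()  # normalize
    return advantages, returns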
def pad(self, encoder_out):
    b, d = encoder_out.size()
    encoder_out = encoder_out.unsqueeze(1)  # unsqueeze the time dimension
    padding = cuda_if_needed(Variable(torch.zeros(b, self.outlength - 1, d)), self.args)
    padded_encoder_out = torch.cat((encoder_out, padding), dim=1)
    padded_encoder_out = padded_encoder_out.contiguous()
    return padded_encoder_out
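# Quick shape illustration of pad (values assumed): a (b, d) encoding becomes a
# (b, outlength, d) sequence whose first timestep holds the encoding and whose
# remaining outlength - 1 timesteps are zeros.
import torch

enc = torch.randn(2, 3)  # (b, d) with b=2, d=3
padded = torch.cat((enc.unsqueeze(1), torch.zeros(2, 4 - 1, 3)), dim=1)  # outlength=4
assert padded.size() == (2, 4, 3)  # (b, outlength, d)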
def create_lang_batch(env, bsize, mode, args):
    volatile = mode != 'train'
    z = 1
    whole_expr = np.random.binomial(n=1, p=0.5)
    enc_inps = []
    target_tokens = []
    zs = []
    targets = []
    for j in range(bsize):
        initial, target = env.reset(mode, z)
        enc_inps.append(np.stack([du.num2onehot(x, env.vocabsize) for x in initial[0]]))
        target_tokens.append(du.num2onehot(initial[1], env.langsize))
        zs.append(du.num2onehot(initial[2], env.zsize))
        targets.append(target)
        env.change_mt()
    enc_inps = torch.FloatTensor(np.array(enc_inps))  # (b, inp_seq_length, vocabsize)
    target_tokens = torch.FloatTensor(target_tokens)  # (b, langsize)
    zs = torch.FloatTensor(zs)  # (b, zsize)
    targets = torch.LongTensor(targets)  # (b, 1)
    enc_inps, target_tokens, zs, targets = map(
        lambda x: cuda_if_needed(x, args), (enc_inps, target_tokens, zs, targets))
    targets = Variable(targets, volatile=volatile)
    return (enc_inps, target_tokens, zs), targets
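# du.num2onehot is a data utility from elsewhere in the project; its assumed
# contract (index -> one-hot row of the given width) can be sketched as:
import numpy as np

def num2onehot(x, width):
    onehot = np.zeros(width, dtype=np.float32)
    onehot[int(x)] = 1.0
    return onehot

# e.g. num2onehot(2, 5) -> array([0., 0., 1., 0., 0.], dtype=float32)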
def init_hidden(self, bsize):
    num_directions, h_hdim = (2, self.hdim // 2) if self.args.bidirectional else (1, self.hdim)
    return cuda_if_needed(
        Variable(torch.zeros(self.nlayers * num_directions, bsize, h_hdim)), self.args)
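# Why hdim // 2 per direction: a bidirectional RNN concatenates the two
# directions' outputs, so halving the per-direction width keeps the output
# hdim wide. An illustrative shape check (a GRU is assumed here to match the
# single hidden tensor returned above; the module's actual RNN is defined
# elsewhere):
import torch
import torch.nn as nn

nlayers, bsize, hdim, indim = 2, 4, 16, 8
rnn = nn.GRU(indim, hdim // 2, num_layers=nlayers, bidirectional=True, batch_first=True)
h0 = torch.zeros(nlayers * 2, bsize, hdim // 2)  # same shape init_hidden produces
out, _ = rnn(torch.randn(bsize, 5, indim), h0)
assert out.size() == (bsize, 5, hdim)  # both directions concatenated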
def improve(self, args):
    batch = self.replay_buffer.sample()
    states, actions, rewards, masks = self.unpack_ppo_batch(batch)  # none of these are Variables
    states, actions, rewards, masks = map(
        lambda x: u.cuda_if_needed(x, args), (states, actions, rewards, masks))

    values = self.valuefn(Variable(states, volatile=True)).data  # (b, 1)
    fixed_log_probs = self.policy.get_log_prob(
        Variable(states, volatile=True), Variable(actions)).data  # (b)
    advantages, returns = self.estimate_advantages(rewards, masks, values)  # (b, 1) (b, 1)

    optim_iter_num = int(np.ceil(states.shape[0] / float(self.minibatch_size)))
    for j in range(self.optim_epochs):
        perm = np.random.permutation(states.shape[0])
        perm = u.cuda_if_needed(torch.LongTensor(perm), args)
        states, actions, returns, advantages, fixed_log_probs = \
            states[perm], actions[perm], returns[perm], advantages[perm], fixed_log_probs[perm]

        for i in range(optim_iter_num):
            ind = slice(i * self.minibatch_size,
                        min((i + 1) * self.minibatch_size, states.shape[0]))
            states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
            minibatch = {'states': states_b, 'actions': actions_b, 'returns': returns_b,
                         'advantages': advantages_b, 'fixed_log_probs': fixed_log_probs_b}
            self.ppo_step(minibatch=minibatch, args=args)
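# Aside: Variable(..., volatile=True) above is the pre-0.4 idiom for running
# inference without autograd bookkeeping; an illustrative equivalent under
# PyTorch >= 0.4 (an assumption, not part of this module) is torch.no_grad():
import torch

states = torch.randn(6, 3)
with torch.no_grad():
    values = states.sum(1, keepdim=True)  # stand-in for self.valuefn(states)
assert not values.requires_grad  # no graph was recorded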
def select_action(self, state):
    b, t, d = state.size()
    action_dist, rnn_out, summarized_rnn_out = self.forward(state)
    """
    action_dist: (b, 3)
    rnn_out: (b, t, hdim)
    summarized_rnn_out: (b, hdim)
    """
    action = sample_from_categorical_dist(action_dist)  # Variable (b)

    if action.data[0] == 2:  # STOP
        stop_dist = cuda_if_needed(Variable(torch.ones(1)), self.args)  # dummy
        secondary_action = sample_from_categorical_dist(stop_dist)
    elif action.data[0] == 1:  # REDUCE
        indices = self.get_indices(state.data)
        if indices.sum() > 0:
            reduction_scores = self.get_reducer_dist(rnn_out)
            reduction_dist = F.softmax(reduction_scores, dim=-1)
        else:
            # no reducible positions: fall back to a one-point distribution
            reduction_dist = cuda_if_needed(
                Variable(torch.ones(b, 1), volatile=action.volatile), self.args)
        secondary_action = sample_from_categorical_dist(reduction_dist)
    elif action.data[0] == 0:  # TRANSLATE
        translator_scores = self.get_translator_dist(summarized_rnn_out)
        translator_dist = F.softmax(translator_scores, dim=-1)
        secondary_action = sample_from_categorical_dist(translator_dist)
    else:
        assert False

    # record which distribution the secondary action was drawn from
    dist_type = action.data[0]
    if dist_type == 2:
        choice_dist = stop_dist.data.cpu().squeeze().numpy()
    elif dist_type == 1:
        choice_dist = reduction_dist.data.cpu().squeeze().numpy()
    elif dist_type == 0:
        choice_dist = translator_dist.data.cpu().squeeze().numpy()
    else:
        assert False
    meta_dist = action_dist.data.cpu().squeeze().numpy()
    return action.data, secondary_action.data, (dist_type, choice_dist, meta_dist)
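# sample_from_categorical_dist is a project helper; the assumed contract is
# "draw one index per row from (possibly unnormalized) categorical weights".
# A hypothetical stand-in built on torch.multinomial, which accepts
# unnormalized nonnegative weights:
import torch

def sample_from_categorical_dist(dist):
    probs = dist.view(1, -1) if dist.dim() == 1 else dist  # (b, k)
    return torch.multinomial(probs, num_samples=1).view(-1)  # LongTensor (b)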
def ppo_step(self, minibatch, args):
    states = minibatch['states']
    actions = minibatch['actions']
    returns = minibatch['returns']
    advantages = minibatch['advantages']
    fixed_log_probs = minibatch['fixed_log_probs']

    # NOTE: deprecated; set self.lr_mult = 1 and anneal with a PyTorch scheduler instead.
    self.clip_epsilon = self.clip_epsilon * self.lr_mult

    """update critic"""
    values_target = Variable(u.cuda_if_needed(returns, args))  # (mb, 1)
    for k in range(self.value_iters):
        values_pred = self.valuefn(Variable(states))  # (mb, 1)
        value_loss = (values_pred - values_target).pow(2).mean()
        # weight decay
        for param in self.valuefn.parameters():
            value_loss += param.pow(2).sum() * self.l2_reg
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

    """update policy"""
    advantages_var = Variable(u.cuda_if_needed(advantages, args)).view(-1)  # (mb)
    log_probs = self.policy.get_log_prob(Variable(states), Variable(actions))  # (mb)
    probs = torch.exp(log_probs)  # (mb)
    entropy = torch.sum(-(log_probs * probs))  # (1)

    ratio = torch.exp(log_probs - Variable(fixed_log_probs))  # (mb)
    surr1 = ratio * advantages_var  # (mb)
    surr2 = torch.clamp(ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon) * advantages_var  # (mb)
    policy_surr = -torch.min(surr1, surr2).mean() - self.entropy_coeff * entropy  # subtract entropy to encourage exploration

    self.policy_optimizer.zero_grad()
    policy_surr.backward()
    torch.nn.utils.clip_grad_norm(self.policy.parameters(), 40)
    self.policy_optimizer.step()
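# Numeric check (illustrative) of the entropy term used above: for
# sampled-action log-probs [ln 0.5, ln 0.25, ln 0.25], sum(-p * log p) gives
# 0.5 * ln 2 + 2 * (0.25 * ln 4) ~ 1.0397 nats.
import torch

log_probs = torch.log(torch.FloatTensor([0.5, 0.25, 0.25]))
probs = torch.exp(log_probs)
entropy = torch.sum(-(log_probs * probs))
assert abs(float(entropy) - 1.0397) < 1e-3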