def learn(self, states, actions, action_probs, returns, next_states):
    values = self.values(states)
    advantages = returns - values
    advantages_normalized = (advantages - advantages.mean()) / (
        advantages.std() + 1.0e-10)

    # print(f"returns[0]: {returns[0]}")
    # print(f"values[0]: {values[0]}")
    # print(f"advantages[0]: {advantages[0]}")
    # print(f"advantages_normalized[0]: {advantages_normalized[0]}")

    policy_objectives = []
    value_losses = []
    entropy_values = []
    for _ in range(self.optimization_epochs):
        batcher = Batcher(num_data_points=len(states),
                          batch_size=self.batch_size)
        batcher.shuffle()
        for batch in batcher.batches():
            sampled_advantages = advantages_normalized[batch]
            sampled_states = states[batch]
            sampled_action_probs = action_probs[batch]
            sampled_actions = actions[batch]
            sampled_returns = returns[batch]
            policy_objective, value_loss, entropy_value = self.learn_from_samples(
                sampled_advantages=sampled_advantages,
                sampled_states=sampled_states,
                sampled_action_probs=sampled_action_probs,
                sampled_actions=sampled_actions,
                sampled_returns=sampled_returns)
            policy_objectives.append(policy_objective)
            value_losses.append(value_loss)
            entropy_values.append(entropy_value)

    # the clipping parameter reduces as time goes on
    # self.epsilon *= 0.999

    # the regularization term also reduces;
    # this reduces exploration in later runs
    self.entropy_weight *= self.entropy_reduction_rate

    return np.mean(policy_objectives), np.mean(value_losses), np.mean(
        entropy_values)
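# `learn` above delegates the per-minibatch update to `learn_from_samples`,
# which is not shown in this snippet. The function below is only a sketch of
# what such a helper might do (PPO clipped surrogate, value regression, entropy
# bonus); `policy`, `value_net`, `optimizer`, `epsilon`, and `entropy_weight`
# are hypothetical stand-ins for the agent's own attributes, not the original code.
import torch


def ppo_minibatch_update(policy, value_net, optimizer,
                         sampled_states, sampled_actions, sampled_action_probs,
                         sampled_returns, sampled_advantages,
                         epsilon=0.2, entropy_weight=0.01):
    # Assumed interface: policy(states, actions) -> (prob of those actions, entropy).
    new_probs, entropy = policy(sampled_states, sampled_actions)
    ratio = new_probs / (sampled_action_probs + 1.0e-10)

    # PPO clipped surrogate objective.
    unclipped = ratio * sampled_advantages
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * sampled_advantages
    policy_objective = torch.min(unclipped, clipped).mean()

    # Critic regression toward the empirical returns.
    value_loss = 0.5 * (sampled_returns - value_net(sampled_states)).pow(2).mean()

    entropy_value = entropy.mean()
    loss = -policy_objective + value_loss - entropy_weight * entropy_value

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return policy_objective.item(), value_loss.item(), entropy_value.item()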
from att_lstm_model import AttLSTM_Model

model = AttLSTM_Model(num_classes, max_length, len(dicts['token2id']), embd,
                      emb_dim=input_dim, hidden_dim=hidden_dim, lr=0.001,
                      num_ways=num_ways)

num_back = 0
step_per_epoch = train_batcher.max_batch_num
best_acc = 0
best_rep = ""
best_dev_rep = ""

train_batcher.shuffle()
for epoch in range(num_epoch):
    loss = 0.0
    print("Epoch %d" % epoch)
    for i in range(step_per_epoch):
        random_ = np.random.random_sample()
        if not use_concepts:
            input_x, y, targets, lengths = train_batcher.next()
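# The training loop above assumes a batcher exposing `max_batch_num`, `shuffle()`,
# and `next()` (returning input_x, y, targets, lengths when use_concepts is False).
# The class below is a hypothetical, minimal stand-in for that interface over
# dense numpy arrays; the real train_batcher implementation is not shown here.
import numpy as np


class SimpleBatcher:
    def __init__(self, input_x, y, targets, lengths, batch_size):
        self.data = [np.asarray(a) for a in (input_x, y, targets, lengths)]
        self.batch_size = batch_size
        self.max_batch_num = len(self.data[0]) // batch_size
        self.cursor = 0

    def shuffle(self):
        # Permute all arrays with the same ordering and restart from the first batch.
        order = np.random.permutation(len(self.data[0]))
        self.data = [a[order] for a in self.data]
        self.cursor = 0

    def next(self):
        start = self.cursor * self.batch_size
        end = start + self.batch_size
        self.cursor = (self.cursor + 1) % self.max_batch_num
        return tuple(a[start:end] for a in self.data)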
[x_context_data, x_target_mean_data, y_data, feature_data] = test_batcher.next()
test_feed = {y: y_data, keep_prob_context: [1], keep_prob_target: [1],
             feature: feature_data}
for i in range(args.context_length * 2 + 1):
    test_feed[x_context[i]] = x_context_data[:, i, :]
test_feed[x_target] = x_target_mean_data

[x_context_data, x_target_mean_data, y_data, feature_data] = dev_batcher.next()
dev_feed = {y: y_data, keep_prob_context: [1], keep_prob_target: [1],
            feature: feature_data}
for i in range(args.context_length * 2 + 1):
    dev_feed[x_context[i]] = x_context_data[:, i, :]
dev_feed[x_target] = x_target_mean_data

ite = 0
train_batcher.shuffle()

# TRAINING
l = 0.
for step in range(50000001):
    [x_context_data, x_target_mean_data, y_data, feature_data] = train_batcher.next()
    feed = {y: y_data, keep_prob_context: [args.keep_prob_context],
            keep_prob_target: [args.keep_prob_target], feature: feature_data}
    for i in range(args.context_length * 2 + 1):
        feed[x_context[i]] = x_context_data[:, i, :]
    feed[x_target] = x_target_mean_data
    sess.run(optimizer, feed_dict=feed)
    l += sess.run(loss, feed_dict=feed)
    if step % 50 == 0 and step > 1:
        ite += 1
        # print step, l/100.
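# The feed dicts above are keyed on graph tensors (y, keep_prob_context,
# keep_prob_target, feature, x_context[i], x_target) that are defined elsewhere.
# The sketch below shows one plausible set of placeholder definitions consistent
# with those loops, assuming the TensorFlow 1.x graph/session API used above;
# the dimensions are hypothetical, not taken from the original code.
import tensorflow as tf

emb_dim = 300          # hypothetical embedding size
num_features = 10      # hypothetical hand-crafted feature size
context_length = 5     # stands in for args.context_length

y = tf.placeholder(tf.int64, [None], name="y")
feature = tf.placeholder(tf.float32, [None, num_features], name="feature")
keep_prob_context = tf.placeholder(tf.float32, [1], name="keep_prob_context")
keep_prob_target = tf.placeholder(tf.float32, [1], name="keep_prob_target")
x_target = tf.placeholder(tf.float32, [None, emb_dim], name="x_target")
# One placeholder per window position: context_length tokens on each side of the
# target, matching the range(args.context_length * 2 + 1) loops above.
x_context = [tf.placeholder(tf.float32, [None, emb_dim], name="x_context_%d" % i)
             for i in range(context_length * 2 + 1)]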
def step(self):
    rollout = []
    hyperparameters = self.hyperparameters
    env_info = self.environment.reset(train_mode=True)[self.brain_name]
    self.states = env_info.vector_observations
    states = self.states

    # Collect a fixed-length rollout from all agents.
    for _ in range(hyperparameters['rollout_length']):
        actions, log_probs, _, values = self.network(states)
        env_info = self.environment.step(actions.cpu().detach().numpy())[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        terminals = np.array([1 if t else 0 for t in env_info.local_done])

        self.all_rewards += rewards
        for i, terminal in enumerate(terminals):
            if terminal:
                self.episode_rewards.append(self.all_rewards[i])
                self.all_rewards[i] = 0

        rollout.append([states, values.detach(), actions.detach(),
                        log_probs.detach(), rewards, 1 - terminals])
        states = next_states

    self.states = states
    pending_value = self.network(states)[-1]
    rollout.append([states, pending_value, None, None, None, None])

    processed_rollout = [None] * (len(rollout) - 1)
    advantages = torch.Tensor(np.zeros((self.config['environment']['number_of_agents'], 1)))
    returns = pending_value.detach()
    for i in reversed(range(len(rollout) - 1)):
        states, value, actions, log_probs, rewards, terminals = rollout[i]
        terminals = torch.Tensor(terminals).unsqueeze(1)
        rewards = torch.Tensor(rewards).unsqueeze(1)
        actions = torch.Tensor(actions.cpu())
        states = torch.Tensor(states)
        next_value = rollout[i + 1][1]

        # Discounted returns and GAE-style advantages, propagated backwards.
        returns = rewards + hyperparameters['discount_rate'] * terminals * returns.cpu()
        td_error = (rewards + hyperparameters['discount_rate'] * terminals * next_value.detach().cpu()
                    - value.detach().cpu())
        advantages = (advantages * hyperparameters['tau'] * hyperparameters['discount_rate'] * terminals
                      + td_error)
        processed_rollout[i] = [states, actions, log_probs, returns, advantages]

    states, actions, log_probs_old, returns, advantages = map(
        lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
    advantages = (advantages - advantages.mean()) / advantages.std()

    batcher = Batcher(states.size(0) // hyperparameters['mini_batch_number'],
                      [np.arange(states.size(0))])
    for _ in range(hyperparameters['optimization_epochs']):
        batcher.shuffle()
        while not batcher.end():
            batch_indices = batcher.next_batch()[0]
            batch_indices = torch.Tensor(batch_indices).long()
            sampled_states = states[batch_indices]
            sampled_actions = actions[batch_indices]
            sampled_log_probs_old = log_probs_old[batch_indices]
            sampled_returns = returns[batch_indices]
            sampled_advantages = advantages[batch_indices]

            _, log_probs, entropy_loss, values = self.network(sampled_states, sampled_actions)

            # PPO clipped surrogate objective plus entropy bonus.
            ratio = (log_probs - sampled_log_probs_old).exp()
            obj = ratio * sampled_advantages
            obj_clipped = ratio.clamp(1.0 - hyperparameters['ppo_clip'],
                                      1.0 + hyperparameters['ppo_clip']) * sampled_advantages
            policy_loss = (-torch.min(obj, obj_clipped).mean(0)
                           - hyperparameters['entropy_coefficent'] * entropy_loss.mean())
            value_loss = 0.5 * (sampled_returns - values.cpu()).pow(2).mean()

            self.optimizier.zero_grad()
            (policy_loss + value_loss).backward()
            nn.utils.clip_grad_norm_(self.network.parameters(), hyperparameters['gradient_clip'])
            self.optimizier.step()

    steps = hyperparameters['rollout_length'] * self.config['environment']['number_of_agents']
    self.total_steps += steps
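# Both `learn` and `step` rely on a `Batcher` helper that is not included in
# these snippets (and the two call it with different interfaces). The class
# below is an assumed, minimal implementation of the index-based interface used
# in `step()`: Batcher(batch_size, [np.arange(n)]), shuffle(), end(),
# next_batch(). It is a sketch, not the project's own class.
import numpy as np


class Batcher:
    def __init__(self, batch_size, data):
        self.batch_size = batch_size
        self.data = data                     # list of aligned arrays, e.g. [np.arange(n)]
        self.num_entries = len(data[0])
        self.reset()

    def reset(self):
        self.batch_start = 0
        self.batch_end = min(self.batch_size, self.num_entries)

    def end(self):
        return self.batch_start >= self.num_entries

    def next_batch(self):
        # Return the current slice of every array, then advance the window.
        batch = [a[self.batch_start:self.batch_end] for a in self.data]
        self.batch_start = self.batch_end
        self.batch_end = min(self.batch_start + self.batch_size, self.num_entries)
        return batch

    def shuffle(self):
        # Apply one random permutation to all arrays and restart iteration.
        order = np.random.permutation(self.num_entries)
        self.data = [a[order] for a in self.data]
        self.reset()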