def sample_and_update(self):
    av_loss = 0
    # Snapshot the current policy into the "old" network before the PPO epochs.
    self.network_old.load_state_dict(self.network.state_dict())
    for step in tqdm(range(self.policy_update_epochs)):
        # cf https://github.com/openai/baselines/blob/master/baselines/pposgd/pposgd_simple.py
        batch_states, batch_actions, batch_rewards, batch_states1, batch_terminals, extra = \
            self.replay_memory.sample_and_split(self.batch_size)
        batch_returns = to_tensor(extra[:, 0])
        batch_advantages = to_tensor(extra[:, 1])
        batch_states = to_tensor(batch_states)
        batch_actions = to_tensor(batch_actions)

        # Old log-probabilities under the snapshot network
        actions_old, v_pred_old = self.network_old(batch_states.detach())
        log_probs_old = self.network_old.log_prob(batch_actions)

        # New log-probabilities under the current network
        actions, v_pred = self.network(batch_states)
        log_probs = self.network.log_prob(batch_actions)

        # Probability ratio pi_new / pi_old, computed from log-probabilities
        ratio = tor.exp(log_probs - log_probs_old)

        # Clipped surrogate objective
        advantages = tor.stack([batch_advantages] * batch_actions.shape[1], 1)
        surr1 = ratio * advantages  # surrogate from conservative policy iteration
        surr2 = ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * advantages
        loss_clip = -tor.mean(tor.min(surr1, surr2))

        # Value loss (value clipping is currently disabled, so both terms are identical)
        vfloss1 = (v_pred - batch_returns) ** 2
        # v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-self.epsilon, self.epsilon)
        vfloss2 = (v_pred - batch_returns) ** 2
        loss_value = 0.5 * tor.mean(tor.max(vfloss1, vfloss2))

        # Entropy bonus (as implemented: a rough proxy rather than the analytic Gaussian entropy)
        loss_ent = -self.ent_coeff * tor.mean((np.e * log_probs) * log_probs)

        # Total loss and optimization step
        total_loss = loss_clip + loss_value + loss_ent
        av_loss += total_loss.item() / float(self.num_episodes)

        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()

    self.num_episodes = 0
    self.replay_memory.clear()
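# The following is a minimal, self-contained sketch of the clipped surrogate term used in
# sample_and_update() above. The tensor shapes and the epsilon value are illustrative
# assumptions, not values taken from the original code.
import torch

def clipped_surrogate(log_probs, log_probs_old, advantages, epsilon=0.2):
    """PPO-clip objective: maximize min(r*A, clip(r, 1-eps, 1+eps)*A)."""
    ratio = torch.exp(log_probs - log_probs_old)
    surr1 = ratio * advantages
    surr2 = ratio.clamp(1 - epsilon, 1 + epsilon) * advantages
    return -torch.min(surr1, surr2).mean()  # negated so it can be minimized

# Example with dummy tensors (batch of 32, 4-dimensional actions):
lp_new = torch.randn(32, 4)
lp_old = lp_new + 0.1 * torch.randn(32, 4)
adv = torch.randn(32, 4)
loss = clipped_surrogate(lp_new, lp_old, adv)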
def gather_experience_and_calc_advantages(self):
    acc_reward = 0
    episode_length = 0
    for i in tqdm(range(self.memory_fill_steps)):
        self.adv_states[i] = self.state
        action, value = self.network(to_tensor(self.state).view(1, -1))
        self.adv_actions[i] = action.data.numpy()
        self.adv_values[i] = value.data.numpy()
        env_action = action.data.squeeze().numpy()
        state, reward, done, _ = self.env.step(env_action)
        episode_length += 1
        acc_reward += reward
        self.state = state
        done = done or episode_length >= self.max_episode_len
        self.adv_terminal[i] = done
        self._episode_step(state, env_action, reward, done)
        if done:
            # Start the next episode from the reset observation.
            self.state = self.env.reset()
            self.num_episodes += 1
            episode_length = 0
        # reward = max(min(reward, 1), -1)
        self.adv_rewards[i] = reward

    # Bootstrap value for the final state if the rollout did not end in a terminal state.
    R = np.zeros((1, 1))
    if not done:
        action, value = self.network(to_tensor(state).view(1, -1))
        R = value.data.numpy()
    self.adv_values[i] = R

    # Generalized advantage estimation (GAE) over the collected rollout.
    A = np.zeros((1, 1))
    for j in reversed(range(i)):
        td = self.adv_rewards[j] + self.gamma * self.adv_values[j + 1] - self.adv_values[j]
        A = td + self.gamma * self.lmda * A
        self.adv_advantages[j] = A
        R = A + self.adv_values[j]
        self.adv_returns[j] = R
        self.replay_memory.append(self.adv_states[j], self.adv_actions[j], self.adv_rewards[j],
                                  self.adv_terminal[j],
                                  [self.adv_returns[j], self.adv_advantages[j]])
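# A self-contained sketch of the GAE recursion used above, written as a standalone
# function. The default gamma/lmda values and the array shapes are illustrative
# assumptions, not constants from the original code.
import numpy as np

def compute_gae(rewards, values, bootstrap_value, gamma=0.99, lmda=0.95):
    """Return (advantages, returns) for a single rollout.

    `values` has the same length as `rewards`; `bootstrap_value` is V(s_T) for the
    state after the last step (0 if the rollout ended in a terminal state).
    """
    T = len(rewards)
    values = np.append(values, bootstrap_value)
    advantages = np.zeros(T)
    A = 0.0
    for t in reversed(range(T)):
        td = rewards[t] + gamma * values[t + 1] - values[t]
        A = td + gamma * lmda * A
        advantages[t] = A
    returns = advantages + values[:-1]
    return advantages, returns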
def value(self, *args, **kwargs):
    """
    Critic evaluation for a single (non-batched) input.
    :param args: List of inputs to the network
    :return: Output of the network in non-batch shape
    """
    if len(args) > 1:
        x = np.hstack(args)
    else:
        x = args[0]
    x = np.expand_dims(x, 0)
    x = to_tensor(x, **kwargs)
    out = self.critic_forward(x)
    self.out = out[0]
    return out[0]
def actions(self, *args, **kwargs):
    """
    Policy evaluation for a batched input.
    :param args: List of inputs to the network
    :return: Output of the network in batch shape
    """
    if len(args) > 1:
        if not tor.is_tensor(args[0]):
            x = np.hstack(args)
            x = to_tensor(x, **kwargs)
        else:
            x = tor.cat(args, 1)
    else:
        x = args[0]
        x = to_tensor(x, **kwargs)
    out = self.forward(x)
    self.out = out
    return out
def forward(self, x):
    if self.activation_functions:
        for i, func in enumerate(self.activation_functions):
            x = func(self.layer_list[i](x))
    else:
        for i, layer in enumerate(self.layer_list[:-1]):
            x = self.relu(layer(x))
    x = self.layer_list[-1](x)

    # The last output unit is the state value; the remaining units parameterize the action means.
    self.means = self.tanh(x[:, :x.shape[1] - 1])
    # Fixed (state-independent) standard deviation exp(sigma_log)
    self.sigmas = to_tensor(np.full((x.shape[0], x.shape[1] - 1), np.e ** self.sigma_log))
    self.dist = Normal(self.means, self.sigmas)
    self.value = x[:, -1]
    self.sampled = self.dist.rsample()
    x = self.sampled
    self.out = x
    return x
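# A minimal sketch of the fixed-sigma Gaussian action head built in forward() above,
# kept outside the class. The batch size, action dimension, and sigma_log value are
# illustrative assumptions.
import numpy as np
import torch
from torch.distributions import Normal

means = torch.tanh(torch.randn(8, 2))              # batch of 8, 2-dimensional actions
sigmas = torch.full((8, 2), float(np.e ** -0.5))   # fixed std = exp(sigma_log), here sigma_log = -0.5
dist = Normal(means, sigmas)
actions = dist.rsample()              # reparameterized sample, keeps gradients flowing to the means
log_probs = dist.log_prob(actions)    # per-dimension log-densities, shape (8, 2)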
def policy_forward(self, x):
    # Policy network
    if self.activation_functions:
        for i, func in enumerate(self.activation_functions):
            x = func(self.layer_list[i](x))
    else:
        for i, layer in enumerate(self.layer_list[:-1]):
            x = self.relu(layer(x))
    x = self.layer_list[-1](x)

    self.means = self.tanh(x)
    # Fixed (state-independent) standard deviation exp(sigma_log)
    self.sigmas = to_tensor(np.full((x.shape[0], x.shape[1]), np.e ** self.sigma_log))
    self.dist = Normal(self.means, self.sigmas)
    self.sampled = self.dist.rsample()
    x = self.sampled
    return x
def gather_experience_and_calc_advantages(self, network, q, reward_q, max_episode_len,
                                          gamma, lmda, env, step, pid, T, sigma_log):
    # Per-worker seeding so that parallel rollouts differ.
    np.random.seed(pid)
    tor.manual_seed(pid)
    env.seed(pid)

    state = env.reset()
    episode_length = 0
    acc_reward = 0

    adv_states = np.zeros((T, env.observation_space.shape[0]))
    adv_actions = np.zeros((T, env.action_space.shape[0]))
    adv_rewards = np.zeros(T)
    adv_values = np.zeros(T)
    adv_returns = np.zeros(T)
    adv_advantages = np.zeros(T)

    while True:
        counter = 0
        for i in range(T):
            counter += 1
            adv_states[i] = state
            action, value = network(to_tensor(state).view(1, -1), sigma_log)
            adv_actions[i] = action.data.numpy()
            adv_values[i] = value.data.numpy()
            env_action = action.data.squeeze().numpy()
            state, reward, done, _ = env.step(np.clip(env_action, -1, 1))
            # reward /= 100.
            self._async_step(state=state, reward=reward)
            episode_length += 1
            acc_reward += reward
            done = done or episode_length >= max_episode_len
            # reward = max(min(reward, 1), -1)
            adv_rewards[i] = reward

            R = np.zeros((1, 1))
            if not done:
                R = value.data.numpy()
            adv_values[i] = R

            if done:
                state = env.reset()
                episode_length = 0
                reward_q.put(acc_reward)
                self._async_episode_step(acc_reward=acc_reward)
                acc_reward = 0
                break

        if done:
            continue

        # Generalized advantage estimation over the T-step rollout; results are pushed to the queue.
        A = np.zeros((1, 1))
        for j in reversed(range(counter - 1)):
            td = adv_rewards[j] + gamma * adv_values[j + 1] - adv_values[j]
            A = td + gamma * lmda * A
            adv_advantages[j] = A
            R = A + adv_values[j]
            adv_returns[j] = R
            q.put([adv_states[j], adv_actions[j], adv_rewards[j], False,
                   [adv_returns[j], adv_advantages[j]]])
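# A sketch of how rollout workers like the one above could be launched with
# torch.multiprocessing. The launch_workers helper, its signature, and the way the
# remaining hyperparameters are forwarded are assumptions for illustration; they are
# not part of the original code.
import torch.multiprocessing as mp

def launch_workers(worker_fn, network, env_fn, n_workers=4, **worker_kwargs):
    q, reward_q = mp.Queue(), mp.Queue()
    network.share_memory()  # workers read the shared policy parameters
    procs = []
    for pid in range(n_workers):
        p = mp.Process(target=worker_fn,
                       args=(network, q, reward_q),
                       kwargs=dict(pid=pid, env=env_fn(), **worker_kwargs))
        p.start()
        procs.append(p)
    return q, reward_q, procs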
goal_occurances[tuple(env.goal)] = 1
state_and_goal = np.zeros((1, num_bits * 2))
for j in range(episode_length):
    # Normal step
    state = env.get_observation()
    goal = env.goal
    hgoal = tuple(state)
    goal_occurances[hgoal] = goal_occurances[hgoal] + 1 if hgoal in goal_occurances else 1

    # The policy input is the concatenation of the current state and the goal.
    state_and_goal[0][0:num_bits] = state
    state_and_goal[0][num_bits::] = goal
    x = to_tensor(state_and_goal, 0)

    action_distribution = policy.forward(x)
    action = policy.sample_action()
    episode_steps[j] = (state, action)

    state, reward, done, _ = env.step(action)
    acc_reward += reward

mvr_tracker.append(acc_reward)
if i % 200 == 0:
    print(i, ". Moving Average Reward:", np.mean(mvr_tracker), "Acc reward:", acc_reward)
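# A self-contained sketch of the state||goal input construction used in the loop above.
# The value of num_bits and the random state/goal are illustrative assumptions.
import numpy as np

num_bits = 8
state = np.random.randint(0, 2, size=num_bits)
goal = np.random.randint(0, 2, size=num_bits)

state_and_goal = np.zeros((1, num_bits * 2))
state_and_goal[0, :num_bits] = state
state_and_goal[0, num_bits:] = goal   # the policy conditions on both the observation and the goal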