def sample(self, explore=False):
    self.step += 1
    if self._current_observation is None:
        self._current_observation = self.env.reset()
        self._current_observation = np.squeeze(self._current_observation).flatten()

    if explore:
        action = self.env.action_space.sample()
    else:
        action = self.agent.act(np.squeeze(self._current_observation).flatten())
    action = np.asarray(action)

    next_observation, reward, done, info = self.env.step(action)
    next_observation = np.squeeze(next_observation).flatten()
    reward = np.squeeze(reward).flatten()
    action = np.squeeze(action).flatten()
    done = np.squeeze(done).astype(np.int8)

    self._path_length += 1
    self._path_return += np.mean(reward)
    self._total_samples += 1

    self.agent.replay_buffer.add_sample(
        observation=self._current_observation,
        action=action,
        reward=reward,
        terminal=done,
        next_observation=next_observation,
    )

    self._current_observation = next_observation

    if np.all(done) or self._path_length >= self._max_path_length:
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return
        self._terminal_position = self._current_observation

        self._current_observation = self.env.reset()
        self._path_length = 0
        self._path_return = np.zeros(1)
        self._n_episodes += 1

        # FIXME: delete it afterwards.
        if explore is False:
            self.episode_rewards.append(self._last_path_return.item())
            self.episode_positions.append([
                self._terminal_position[0].item(),
                self._terminal_position[1].item(),
            ])

        self.log_diagnostics()
        logger.log(tabular)
        logger.dump_all()
    else:
        self._current_observation = next_observation
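# Not from the original codebase: a small, self-contained illustration of the
# preprocessing the sampler above applies to every transition, assuming plain numpy
# observations. np.squeeze(x).flatten() drops singleton axes and yields a 1-D vector,
# and `done` is cast to int8 before it is stored as the terminal flag.
import numpy as np

obs = np.zeros((1, 4, 1))                # e.g. an observation with singleton axes
flat_obs = np.squeeze(obs).flatten()     # shape (4,), what act()/add_sample receive
done = np.squeeze(np.array([True]))      # 0-d boolean
terminal = done.astype(np.int8)          # stored as 1 (int8) in the replay buffer
print(flat_obs.shape, terminal)          # -> (4,) 1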
def sample(self, explore=False):
    self.step += 1
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    action_n = []
    if explore:
        action_n = self.env.action_spaces.sample()
    else:
        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action = agent.act(current_observation.astype(np.float32))
            action_n.append(np.array(action))
    action_n = np.asarray(action_n)

    next_observation_n, reward_n, done_n, info = self.env.step(action_n)

    # Disabled branch kept from the original code: optionally overwrite the executed
    # actions with the ones reported back by the environment.
    infoif = False
    if infoif:
        action_n = info["new_act"]

    if self._global_reward:
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._path_return += np.array(reward_n, dtype=np.float32)
    self._total_samples += 1

    for i, agent in enumerate(self.agents):
        opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
        agent.replay_buffer.add_sample(
            observation=self._current_observation_n[i].astype(np.float32),
            action=action_n[i].astype(np.float32),
            reward=reward_n[i].astype(np.float32),
            terminal=done_n[i],
            next_observation=next_observation_n[i].astype(np.float32),
            opponent_action=opponent_action.astype(np.float32),
        )

    self._current_observation_n = next_observation_n

    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return

        self.container["path_rw"].append(self._path_return)
        self.container["mean_rw"].append(self._mean_path_return)

        self._path_length = 0
        self._path_return = np.zeros(self.agent_num)
        self._n_episodes += 1

        self.log_diagnostics()
        logger.log(tabular)
        logger.dump_all()
    else:
        self._current_observation_n = next_observation_n
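# Not from the original codebase: a stand-alone illustration of the fancy-indexing
# trick used above to build `opponent_action` for agent i. Indexing `action_n` with
# the list of all indices except i selects every other agent's action, and .flatten()
# concatenates them into a single vector for the replay buffer.
import numpy as np

action_n = np.array([[0.1, 0.2],   # agent 0
                     [0.3, 0.4],   # agent 1
                     [0.5, 0.6]])  # agent 2
i = 1
opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
print(opponent_action)  # -> [0.1 0.2 0.5 0.6]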
def _train(self, batch, weights=None):
    prior_loss = self._prior_train(batch)
    opponent_policy_loss = self._opponent_train(batch)
    critic_loss = self._critic_train(batch, weights)
    actor_loss = self._actor_train(batch, weights)

    self._train_step += 1
    if self._train_step % self._target_update_period == 0:
        self._update_target()

    losses = {
        "pg_loss": actor_loss.numpy(),
        "critic_loss": critic_loss.numpy(),
        "opponent_policy_loss": opponent_policy_loss.numpy(),
        "prior_loss": prior_loss.numpy(),
    }

    if self._train_step % 1 == 0:
        tabular.record("q loss", critic_loss.numpy().item())
        tabular.record("opponent_policy_loss loss", opponent_policy_loss.numpy().item())
        tabular.record("actor_loss loss", actor_loss.numpy().item())
        tabular.record("bi", batch["annealing"].numpy())
        tabular.record("bj", 1.0)
        if prior_loss is not None:
            tabular.record("prior loss", prior_loss.numpy())
        logger.log(tabular)

    if self._train_step % 100 == 0:
        # print('training statistics')
        # print(self._opponent_policy.get_diagnostics(batch['observations']))
        # print(self._prior.get_diagnostics(batch['observations']))
        opponent_actions = self._opponent_policy.get_actions_np(batch["observations"])
        # print(self._policy.get_diagnostics([batch['observations'], opponent_actions]))
        actions = self._policy.get_actions_np([batch["observations"], opponent_actions])
        # print(self._qf.get_diagnostics([batch['observations'], actions, opponent_actions]))

    return losses
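# Not from the original codebase: `_update_target()` above is assumed to copy (or
# Polyak-average) the online critic/policy weights into their target networks every
# `_target_update_period` training steps. A minimal numpy sketch of that pattern,
# with `tau` hypothetical (tau=1.0 is a hard copy, tau<1.0 a soft update):
import numpy as np

def update_target(online_weights, target_weights, tau=0.01):
    # In-place soft update: target <- tau * online + (1 - tau) * target.
    for w, w_targ in zip(online_weights, target_weights):
        w_targ[...] = tau * w + (1.0 - tau) * w_targ

online = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
update_target(online, target, tau=0.5)
print(target[0])  # -> 0.5 everywhere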
def sample(self, explore=False):
    self.step += 1
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    action_n = []
    # print(self._current_observation_n)
    # print(self._current_observation_n.shape)
    if explore:
        action_n = self.env.action_spaces.sample()
    else:
        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action = agent.act(current_observation.astype(np.float32))
            action_n.append(np.array(action))
    action_n = np.asarray(action_n)

    # Step the environment (here the FortAttack global environment).
    next_observation_n, reward_n, done_n, info = self.env.step(action_n)

    if self._global_reward:
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._last_path_return = np.array(reward_n, dtype=np.float32)
    self._path_return += self._last_path_return
    self._total_samples += 1

    for i, agent in enumerate(self.agents):
        opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
        agent.replay_buffer.add_sample(
            observation=self._current_observation_n[i].astype(np.float32),
            action=action_n[i].astype(np.float32),
            reward=np.array(reward_n[i], np.float32),
            terminal=done_n[i],
            next_observation=next_observation_n[i].astype(np.float32),
            opponent_action=opponent_action.astype(np.float32),
        )

    if self.render_after is not None:
        if self._n_episodes % self.render_after == 0:
            # render(self.env,
            #        "/tmp/episode_%08d" % self._path_length,
            #        self._path_length)
            self.env.render(mode="rgb_array")[0]
            # time.sleep(0.03)

    self._current_observation_n = next_observation_n

    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        # self._last_path_return = self._path_return
        # print('last path return', self._path_return)
        # if self._n_episodes % 100 == 0:
        #     render(self.env,
        #            "/tmp/episode_%08d" % self._path_length,
        #            self._path_length,
        #            True)

        self._path_length = 0
        self._path_return = np.zeros(self.agent_num)
        self._n_episodes += 1

        self.log_diagnostics()  # one of these lines is printing to screen
        logger.log(tabular)
        logger.dump_all()

        self._current_observation_n = self.env.reset()
    else:
        self._current_observation_n = next_observation_n
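# Not from the original codebase: a stand-alone illustration of the `_global_reward`
# option used in both multi-agent samplers above. When it is enabled, every agent is
# given the summed team reward instead of its own individual reward.
import numpy as np

agent_num = 3
reward_n = [1.0, -0.5, 2.0]                           # per-agent rewards from the env
reward_n = np.array([np.sum(reward_n)] * agent_num)   # -> [2.5, 2.5, 2.5]
print(reward_n)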