def sample(self, explore=False):
    self.step += 1
    if self._current_observation is None:
        self._current_observation = self.env.reset()
    self._current_observation = np.squeeze(self._current_observation).flatten()

    if explore:
        # Warm-up: draw a random action from the environment's action space.
        action = self.env.action_space.sample()
    else:
        action = self.agent.act(np.squeeze(self._current_observation).flatten())
    action = np.asarray(action)

    next_observation, reward, done, info = self.env.step(action)
    next_observation = np.squeeze(next_observation).flatten()
    reward = np.squeeze(reward).flatten()
    action = np.squeeze(action).flatten()
    done = np.squeeze(done).astype(np.int8)

    self._path_length += 1
    self._path_return += np.mean(reward)
    self._total_samples += 1

    self.agent.replay_buffer.add_sample(
        observation=self._current_observation,
        action=action,
        reward=reward,
        terminal=done,
        next_observation=next_observation,
    )
    self._current_observation = next_observation

    if np.all(done) or self._path_length >= self._max_path_length:
        # Episode finished: update return statistics, log, and reset the environment.
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return
        self._terminal_position = self._current_observation
        self._current_observation = self.env.reset()
        self._path_length = 0
        self._path_return = np.zeros(1)
        self._n_episodes += 1
        # FIXME: delete it afterwards.
        if not explore:
            self.episode_rewards.append(self._last_path_return.item())
            self.episode_positions.append([
                self._terminal_position[0].item(),
                self._terminal_position[1].item(),
            ])
        self.log_diagnostics()
        logger.log(tabular)
        logger.dump_all()
    else:
        self._current_observation = next_observation
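# Usage sketch for the single-agent sampler above (illustrative only; everything
# except sample() and the agent's replay_buffer is an assumption, not this
# codebase's confirmed API). A typical driver first explores with random actions
# to seed the replay buffer, then alternates sampling with gradient updates:
#
#     sampler = Sampler(env, agent, max_path_length=100)   # hypothetical constructor
#     for _ in range(1000):                                 # warm-up phase
#         sampler.sample(explore=True)
#     for _ in range(100000):                               # training phase
#         sampler.sample()
#         batch = agent.replay_buffer.random_batch(256)     # assumed batch-sampling method
#         agent.train(batch)                                 # assumed update method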
def sample(self, explore=False):
    self.step += 1
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    action_n = []
    if explore:
        action_n = self.env.action_spaces.sample()
    else:
        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action = agent.act(current_observation.astype(np.float32))
            action_n.append(np.array(action))
        action_n = np.asarray(action_n)

    next_observation_n, reward_n, done_n, info = self.env.step(action_n)

    if self._global_reward:
        # Cooperative setting: every agent receives the summed team reward.
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._path_return += np.array(reward_n, dtype=np.float32)
    self._total_samples += 1

    for i, agent in enumerate(self.agents):
        # Store the other agents' actions alongside the transition for the centralized critic.
        opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
        agent.replay_buffer.add_sample(
            observation=self._current_observation_n[i].astype(np.float32),
            action=action_n[i].astype(np.float32),
            reward=reward_n[i].astype(np.float32),
            terminal=done_n[i],
            next_observation=next_observation_n[i].astype(np.float32),
            opponent_action=opponent_action.astype(np.float32),
        )
    self._current_observation_n = next_observation_n

    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return
        self.container["path_rw"].append(self._path_return)
        self.container["mean_rw"].append(self._mean_path_return)
        self._path_length = 0
        self._path_return = np.zeros(self.agent_num)
        self._n_episodes += 1
        self.log_diagnostics()
        logger.log(tabular)
        logger.dump_all()
    else:
        self._current_observation_n = next_observation_n
def sample(self, explore=False):
    self.step += 1
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    action_n = []
    if explore:
        action_n = self.env.action_spaces.sample()
    else:
        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action = agent.act(current_observation.astype(np.float32))
            action_n.append(np.array(action))
        action_n = np.asarray(action_n)

    # Step the environment (here the fortAttackGlobalenv).
    next_observation_n, reward_n, done_n, info = self.env.step(action_n)

    if self._global_reward:
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._last_path_return = np.array(reward_n, dtype=np.float32)
    self._path_return += self._last_path_return
    self._total_samples += 1

    for i, agent in enumerate(self.agents):
        opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
        agent.replay_buffer.add_sample(
            observation=self._current_observation_n[i].astype(np.float32),
            action=action_n[i].astype(np.float32),
            reward=np.array(reward_n[i], dtype=np.float32),
            terminal=done_n[i],
            next_observation=next_observation_n[i].astype(np.float32),
            opponent_action=opponent_action.astype(np.float32),
        )

    if self.render_after is not None and self._n_episodes % self.render_after == 0:
        self.env.render(mode="rgb_array")[0]

    self._current_observation_n = next_observation_n

    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._path_length = 0
        self._path_return = np.zeros(self.agent_num)
        self._n_episodes += 1
        self.log_diagnostics()
        # Note: one of these logger calls prints to screen.
        logger.log(tabular)
        logger.dump_all()
        self._current_observation_n = self.env.reset()
    else:
        self._current_observation_n = next_observation_n
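# Usage sketch for the multi-agent samplers above (illustrative only; the
# MASampler name, constructor, and training calls are assumptions). Each agent
# keeps its own replay buffer, and because add_sample() also stores
# opponent_action, a centralized critic can condition on the joint action when
# per-agent batches are drawn:
#
#     sampler = MASampler(env, agents, max_path_length=25)     # hypothetical constructor
#     for _ in range(1000):                                     # warm-up with random joint actions
#         sampler.sample(explore=True)
#     for _ in range(100000):
#         sampler.sample()
#         for agent in agents:
#             batch = agent.replay_buffer.random_batch(256)     # assumed API; batch includes opponent_action
#             agent.train(batch)                                 # assumed update method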