def test(self, episodes, render=False, log=False, record=False):
    self.model.eval()
    env = self.env
    if record:
        env = Monitor(self.env_fn(),
                      directory=os.path.join(self.path, 'recordings'),
                      force=True,
                      video_callable=lambda episode_id: True)
    with torch.no_grad():
        test_rewards = []
        total_test_steps = 0
        for ep in range(episodes):
            terminal = False
            obs_n = env.reset()
            step = 0
            ep_reward = [0 for _ in range(self.model.n_agents)]
            while not terminal:
                if render:
                    env.render()
                torch_obs_n = torch.FloatTensor(obs_n).to(self.device).unsqueeze(0)
                action_n = self._select_action(self.model, torch_obs_n, explore=False)
                next_obs_n, reward_n, done_n, info = env.step(action_n)
                terminal = all(done_n) or step >= self.episode_max_steps
                obs_n = next_obs_n
                step += 1
                for i, r_n in enumerate(reward_n):
                    ep_reward[i] += r_n
            total_test_steps += step
            test_rewards.append(ep_reward)
        test_rewards = np.array(test_rewards).mean(axis=0)

        # log - test
        if log:
            for i, r_n in enumerate(test_rewards):
                self.writer.add_scalar('agent_{}/eval_reward'.format(i), r_n, self._step_iter)
            self.writer.add_scalar('_overall/eval_reward', sum(test_rewards), self._step_iter)
            self.writer.add_scalar('_overall/test_ep_steps', total_test_steps / episodes, self._step_iter)
    if record:
        env.close()
    return test_rewards
def test(self, episodes, render=False, log=False, record=False):
    self.model.eval()
    env = self.env
    if record:
        env = Monitor(self.env_fn(),
                      directory=os.path.join(self.path, 'recordings'),
                      force=True,
                      video_callable=lambda episode_id: True)
    with torch.no_grad():
        test_rewards = []
        total_test_steps = 0
        for ep in range(episodes):
            terminal = False
            obs_n = env.reset()
            step = 0
            ep_reward = [0 for _ in range(self.model.n_agents)]
            self.model.init_hidden(device=self.device)
            while not terminal:
                if render:
                    env.render()
                torch_obs_n = torch.FloatTensor(obs_n).to(self.device).unsqueeze(0)

                # each agent encodes its own observation into a latent "thought"
                thoughts = []
                for agent_i in range(self.model.n_agents):
                    thoughts.append(self.model.agent(agent_i).get_thought(torch_obs_n[:, agent_i]))

                # share thoughts for a few rounds by averaging with the right neighbour
                for i in range(self.share_iter):
                    for agent_i in range(self.model.n_agents):
                        thoughts[agent_i] = (thoughts[agent_i]
                                             + thoughts[(agent_i + 1) % len(thoughts)]) / 2
                thoughts = torch.stack(thoughts)

                action_n = []
                for agent_i in range(self.model.n_agents):
                    # assuming every other agent is a neighbour as of now
                    _neighbours = list(range(self.model.n_agents))
                    _neighbours.remove(agent_i)

                    logits = self.model.agent(agent_i)(thoughts[agent_i])
                    prob = F.softmax(logits, dim=1)
                    action = prob.argmax(1).item()
                    # action = prob.multinomial(num_samples=1).detach().item()

                    if log and step == 0 and ep == 0:
                        log_prob = F.log_softmax(logits, dim=1)
                        entropy = -(log_prob * prob).sum(1)
                        self.writer.add_scalar('agent_{}/entropy'.format(agent_i),
                                               entropy, self._step_iter)
                    action_n.append(action)

                next_obs_n, reward_n, done_n, info = env.step(action_n)
                terminal = all(done_n) or step >= self.episode_max_steps
                obs_n = next_obs_n
                step += 1
                for i, r_n in enumerate(reward_n):
                    ep_reward[i] += r_n
            total_test_steps += step
            test_rewards.append(ep_reward)
        test_rewards = np.array(test_rewards).mean(axis=0)

        # log - test
        if log:
            for i, r_n in enumerate(test_rewards):
                self.writer.add_scalar('agent_{}/eval_reward'.format(i), r_n, self._step_iter)
            self.writer.add_scalar('_overall/eval_reward', sum(test_rewards), self._step_iter)
            self.writer.add_scalar('_overall/test_ep_steps', total_test_steps / episodes, self._step_iter)
    if record:
        env.close()
    return test_rewards
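# The neighbour-averaging step above is easier to see in isolation. The sketch
# below is illustrative only: the number of agents, the thought dimension and
# the share_iter value are made up, not taken from the model.
import torch

# hypothetical latent "thoughts", one (1, 4) vector per agent for 3 agents
thoughts = [torch.randn(1, 4) for _ in range(3)]

share_iter = 2  # illustrative number of sharing rounds
for _ in range(share_iter):
    for agent_i in range(len(thoughts)):
        # each agent averages its thought with its right neighbour's (ring topology);
        # updates happen in place, so the last agent already sees agent 0's mixed value
        thoughts[agent_i] = (thoughts[agent_i] + thoughts[(agent_i + 1) % len(thoughts)]) / 2

# stack into a single tensor, just as the test loop does before acting
thoughts = torch.stack(thoughts)
print(thoughts.shape)  # torch.Size([3, 1, 4])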
import argparse

import gym
from gym.wrappers import Monitor

parser = argparse.ArgumentParser(description='Interactive Agent for ma-gym')
parser.add_argument('--env', default='Checkers-v0',
                    help='Name of the environment (default: %(default)s)')
parser.add_argument('--episodes', type=int, default=1,
                    help='episodes (default: %(default)s)')
args = parser.parse_args()

print('Enter the actions of all agents together and press enter '
      '(e.g. \'11<enter>\' means action 1 for agent 1 and action 1 for agent 2)')

env = gym.make('ma_gym:{}'.format(args.env))
env = Monitor(env, directory='recordings', force=True)
for ep_i in range(args.episodes):
    done_n = [False for _ in range(env.n_agents)]
    ep_reward = 0

    obs_n = env.reset()
    env.render()
    while not all(done_n):
        action_n = [int(_) for _ in input('Action:')]
        obs_n, reward_n, done_n, _ = env.step(action_n)
        ep_reward += sum(reward_n)
        env.render()

    print('Episode #{} Reward: {}'.format(ep_i, ep_reward))
env.close()
state = concat_obs(obs_n)
round = 0
cur_round = 0
is_l_hit = False
is_r_hit = False

while not all(done_n):
    l_action = LEFT_agent.get_action(state_number, state, evaluation=False)
    r_action = RIGHT_agent.get_action(state_number, state, evaluation=False)

    end_round = False
    next_state, reward_n, done_n, info = env.step([l_action, r_action])
    next_state = concat_obs(next_state)

    cur_round = info['rounds']
    if cur_round != round:
        is_l_hit = False
        is_r_hit = False
        round = cur_round
        end_round = True

    paddle_l = np.array([state[0], state[1]])
    paddle_r = np.array([state[2], state[3]])
    ball = np.array([state[4], state[5]])
    dir = np.array(state[6:12])
    next_dir = np.array(next_state[6:12])
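# Both this fragment and the DQN training loop further below rely on a
# concat_obs helper that is not shown in this excerpt. A minimal sketch of
# what such a helper might look like, assuming it simply flattens the
# per-agent observation lists into one joint state vector (the real helper
# may order or trim the features differently):
import numpy as np

def concat_obs(obs_n):
    # obs_n is the list of per-agent observations returned by the ma-gym env;
    # flatten everything into a single 1-D float vector for the joint agent.
    return np.concatenate([np.asarray(obs, dtype=np.float32).ravel() for obs in obs_n])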
            break

    print('Episode: {:07d} - Cumulative reward this episode: {}'.format(e, cumulative_reward))

input('End of training. \n\nPress `ENTER` to start testing.')

env = Monitor(env, directory="recordings", video_callable=lambda episode_id: True, force=True)
obs = env.reset()

# while True:
for _ in range(num_steps):
    env.render()

    state_0, state_1 = get_obs_tuples(obs)
    a_0_y, b_0_y, b_0_x, d_0, e_0_y = state_0[0], state_0[1], state_0[2], state_0[3], state_0[4]
    a_1_y, b_1_y, b_1_x, d_1, e_1_y = state_1[0], state_1[1], state_1[2], state_1[3], state_1[4]

    action_0 = agent_0.get_action_greedy(a_0_y, b_0_y, b_0_x, d_0, e_0_y)
    action_1 = agent_1.get_action_greedy(a_1_y, b_1_y, b_1_x, d_1, e_1_y)
    # agent_1's greedy action is overridden with a random sample from the env
    action_1 = env.action_space.sample()[1]

    action = [action_0, action_1]
    obs, reward, done, info = env.step(action)
    if all(done):
        break

env.close()
import argparse

import gym
from gym.wrappers import Monitor

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Random Agent for ma-gym')
    parser.add_argument('--env', default='Checkers-v0',
                        help='Name of the environment (default: %(default)s)')
    parser.add_argument('--episodes', type=int, default=1,
                        help='episodes (default: %(default)s)')
    args = parser.parse_args()

    env = gym.make(args.env)
    env = Monitor(env, directory='recordings/' + args.env, force=True)
    for ep_i in range(args.episodes):
        done_n = [False for _ in range(env.n_agents)]
        ep_reward = 0

        env.seed(ep_i)
        obs_n = env.reset()
        env.render()
        while not all(done_n):
            action_n = env.action_space.sample()
            obs_n, reward_n, done_n, info = env.step(action_n)
            ep_reward += sum(reward_n)
            env.render()

        print('Episode #{} Reward: {}'.format(ep_i, ep_reward))
    env.close()
class Tester:
    def __init__(self, models, env_name="PongDuel-v0", render=True, video=True,
                 step_number=1000, log_after_steps=200, log_on_win=True):
        self._models = models  # list of model ids to test; if "all", test every registered model
        if "all" in self._models:
            self._models = [i for i in registered_models["all"]]
        self._render = render
        self._video = video
        self._step_number = step_number
        self._log_after_steps = log_after_steps
        self._log_on_win = log_on_win
        self._env_name = env_name
        self._env = None

    def log_score(self, step, score, msg=""):
        print("Step: {0:05d} Score: {1}".format(step, score), end="")
        if msg != "":
            print(" [{}]".format(msg))
        else:
            print("")

    def run_tests(self):
        print("Running tests for model IDs: {}".format(self._models))
        print("-" * 10)
        models_score_summary = {}

        for model_id in self._models:
            print("Selected model_id: {}".format(model_id))
            model = AutoLoadModel(model_id)
            score = {"agent_0": {"moves": 0, "wins": 0},
                     "agent_1": {"moves": 0, "wins": 0}}

            self._env = gym.make(self._env_name)
            if self._video:
                if isinstance(model_id, list):
                    model_id = "{}_VS_{}".format(model_id[0], model_id[1])
                output_directory = "recordings/{}".format(model_id)
                self._env = Monitor(self._env, directory=output_directory,
                                    video_callable=lambda episode_id: True, force=True)

            obs_n = self._env.reset()
            for step in range(self._step_number):
                # render env
                if self._render:
                    self._env.render()

                # select actions
                actions, actions_as_list = model.get_agents_actions(obs_n)

                # update moves counter
                for an in ["agent_0", "agent_1"]:
                    if actions[an] in [1, 2]:
                        score[an]["moves"] += 1

                # execute actions
                obs_n, reward_n, done_n, info = self._env.step(actions_as_list)

                # update score
                if any(reward_n):
                    score["agent_0"]["wins"] += reward_n[0]
                    score["agent_1"]["wins"] += reward_n[1]
                    if self._log_on_win:
                        self.log_score(step, score, "win")

                models_score_summary[model_id] = score
                if step % self._log_after_steps == 0:
                    self.log_score(step, score)
                if all(done_n):
                    break

            self.log_score(step, score, "end")
            print("-" * 10)
            self._env.close()

        # Score summary
        print("Summary:")
        for k, v in models_score_summary.items():
            n_moves = 0
            n_wins = 0
            print("Model {}:".format(k))
            for a, b in v.items():
                print(a, b)
                n_moves += b["moves"]
                n_wins += b["wins"]
            print("Average move count: {}".format(n_moves / 2))
            print("Total move count: {}".format(n_moves))
            print("Total win count: {}".format(n_wins))
            print("")
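# A minimal usage sketch for the Tester class above; it assumes registered_models,
# AutoLoadModel and the PongDuel-v0 environment are importable in this module,
# and "all" is the only model id taken from the source rather than a specific one:
if __name__ == '__main__':
    tester = Tester(models=["all"], env_name="PongDuel-v0",
                    render=False, video=True, step_number=1000)
    tester.run_tests()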
def main():
    env = gym.make('PongDuel-v0')
    env = Monitor(env, directory='testings/PongDuel-v0', force=True)

    action_dim = env.action_space[0].n
    state_dim = env.observation_space[0].shape[0] + 2

    MAIN_DQN = create_model(state_dim, action_dim, is_dueling=True)
    TARGET_DQN = create_model(state_dim, action_dim, is_dueling=True)

    replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE, use_per=USE_PER)
    agent = Agent(MAIN_DQN, TARGET_DQN, replay_buffer, action_dim,
                  input_shape=INPUT_SHAPE, batch_size=BATCH_SIZE, use_per=USE_PER)

    if LOAD_FROM is None:
        state_number = 0
        rewards = []
        loss_list = []
    else:
        print('Loading from', LOAD_FROM)
        meta = agent.load(LOAD_FROM, LOAD_REPLAY_BUFFER)
        state_number = meta['state_number']
        rewards = meta['rewards']
        loss_list = meta['loss_list']

    try:
        last_50_l = deque(maxlen=50)
        last_50_r = deque(maxlen=50)
        recent_10_game = deque(maxlen=10)
        start_time = datetime.now()
        print(start_time)

        for ep in range(MAX_EPISODE_LENGTH):
            done_n = [False for _ in range(env.n_agents)]
            l_cnt = 0
            r_cnt = 0
            state = env.reset()
            state = concat_obs(state)

            while not all(done_n):
                # left paddle is controlled by the trained agent, right paddle by random_action()
                trained_action = agent.get_action(state_number, state, evaluation=False)
                next_state, reward_n, done_n, _ = env.step([trained_action, random_action()])
                next_state = concat_obs(next_state)

                paddle_l = next_state[0]
                paddle_r = next_state[2]
                ball = state[4]
                delta_l = np.subtract(paddle_l, ball)
                delta_r = np.subtract(paddle_r, ball)

                # if reward_n[1] == 1:
                #     if delta_l > 0:
                #         if trained_action == 1:
                #             reward = 1
                #         else:
                #             reward = -1
                #     elif delta_l == 0:
                #         if trained_action == 0:
                #             reward = 1
                #         else:
                #             reward = -1
                #     else:
                #         if trained_action == 2:
                #             reward = 1
                #         else:
                #             reward = -1
                # elif reward_n[0] == 1:
                #     reward = abs(delta_r) * 10
                # else:
                #     reward = reward_n[0]

                agent.add_experience(action=trained_action, state=state,
                                     reward=reward_n[0], clip_reward=CLIP_REWARD,
                                     done=done_n[0])
                state_number += 1
                l_reward = reward_n[0]
                r_reward = reward_n[1]
                l_cnt += l_reward
                r_cnt += r_reward

                if state_number % UPDATE_FREQ == 0 and agent.replay_buffer.count > MIN_REPLAY_BUFFER_SIZE:
                    loss, _ = agent.learn(BATCH_SIZE, gamma=DISCOUNT_FACTOR,
                                          state_number=state_number,
                                          priority_scale=PRIORITY_SCALE)
                    loss_list.append(loss)
                if state_number % UPDATE_FREQ == 0 and state_number > MIN_REPLAY_BUFFER_SIZE:
                    agent.update_target_network()

                state = next_state

            if l_cnt > r_cnt:
                is_win = 'Win'
                recent_10_game.append(1)
            elif l_cnt == r_cnt:
                is_win = 'draw'
            else:
                is_win = 'Lose'
                recent_10_game.append(0)

            last_50_l.append(l_cnt)
            last_50_r.append(r_cnt)
            avg_l, avg_r = np.mean(last_50_l), np.mean(last_50_r)
            recent_10_win_rate = np.mean(recent_10_game)
            cur_time = datetime.now()
            print("{}||Episode #{} {} left: {} right: {} / avg score {}:{} / recent 10 game win rate: {}".format(
                cur_time, ep, is_win, l_cnt, r_cnt, avg_l, avg_r, recent_10_win_rate))

        SAVE_PATH = 'PongDuel-saves'
        print('\nTraining end.')
        if SAVE_PATH is None:
            try:
                SAVE_PATH = input(
                    'Would you like to save the trained model? If so, type in a save path, '
                    'otherwise, interrupt with ctrl+c. ')
            except KeyboardInterrupt:
                print('\nExiting...')
        if SAVE_PATH is not None:
            print('Saving...')
            agent.save(f'{SAVE_PATH}/save-{str(state_number).zfill(8)}',
                       state_number=state_number, rewards=rewards, loss_list=loss_list)
            print('Saved.')

    except KeyboardInterrupt:
        SAVE_PATH = 'PongDuel-saves'
        print('\nTraining exited early.')
        if SAVE_PATH is None:
            try:
                SAVE_PATH = input(
                    'Would you like to save the trained model? If so, type in a save path, '
                    'otherwise, interrupt with ctrl+c. ')
            except KeyboardInterrupt:
                print('\nExiting...')
        if SAVE_PATH is not None:
            print('Saving...')
            agent.save(f'{SAVE_PATH}/save-{str(state_number).zfill(8)}',
                       state_number=state_number, rewards=rewards, loss_list=loss_list)
            print('Saved.')
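# The loop above pits the trained left agent against a random_action() opponent
# that is not defined in this excerpt. A minimal stand-in, assuming PongDuel's
# three discrete paddle actions (the exact action ids are an assumption here):
import random

def random_action():
    # hypothetical helper: sample one of the 3 discrete actions uniformly
    return random.randint(0, 2)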