Example #1
    def test(self, episodes, render=False, log=False, record=False):
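        """Run `episodes` evaluation episodes with exploration disabled.

        Optionally renders the environment, logs mean per-agent rewards via
        `self.writer`, and records videos with the `Monitor` wrapper.
        """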
        self.model.eval()
        env = self.env
        if record:
            env = Monitor(self.env_fn(),
                          directory=os.path.join(self.path, 'recordings'),
                          force=True,
                          video_callable=lambda episode_id: True)
        with torch.no_grad():
            test_rewards = []
            total_test_steps = 0
            for ep in range(episodes):
                terminal = False
                obs_n = env.reset()
                step = 0
                ep_reward = [0 for _ in range(self.model.n_agents)]
                while not terminal:
                    if render:
                        env.render()

                    torch_obs_n = torch.FloatTensor(obs_n).to(
                        self.device).unsqueeze(0)
                    action_n = self._select_action(self.model,
                                                   torch_obs_n,
                                                   explore=False)

                    next_obs_n, reward_n, done_n, info = env.step(action_n)
                    terminal = all(done_n) or step >= self.episode_max_steps

                    obs_n = next_obs_n
                    step += 1
                    for i, r_n in enumerate(reward_n):
                        ep_reward[i] += r_n

                total_test_steps += step
                test_rewards.append(ep_reward)

            test_rewards = np.array(test_rewards).mean(axis=0)
            if log:
                # log - test
                for i, r_n in enumerate(test_rewards):
                    self.writer.add_scalar('agent_{}/eval_reward'.format(i),
                                           r_n, self._step_iter)
                self.writer.add_scalar('_overall/eval_reward',
                                       sum(test_rewards), self._step_iter)
                self.writer.add_scalar('_overall/test_ep_steps',
                                       total_test_steps / episodes,
                                       self._step_iter)
        if record:
            env.close()

        return test_rewards
Example #2
    def test(self, episodes, render=False, log=False, record=False):
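        """Evaluate the model greedily for `episodes` episodes.

        Each step, per-agent "thoughts" are computed, mixed with ring
        neighbours for `self.share_iter` rounds, then decoded into actions.
        """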
        self.model.eval()

        env = self.env
        if record:
            env = Monitor(self.env_fn(),
                          directory=os.path.join(self.path, 'recordings'),
                          force=True,
                          video_callable=lambda episode_id: True)
        with torch.no_grad():
            test_rewards = []
            total_test_steps = 0
            for ep in range(episodes):
                terminal = False
                obs_n = env.reset()
                step = 0
                ep_reward = [0 for _ in range(self.model.n_agents)]

                self.model.init_hidden(device=self.device)
                while not terminal:
                    if render:
                        env.render()

                    torch_obs_n = torch.FloatTensor(obs_n).to(
                        self.device).unsqueeze(0)

                    thoughts = []
                    for agent_i in range(self.model.n_agents):
                        thoughts.append(
                            self.model.agent(agent_i).get_thought(
                                torch_obs_n[:, agent_i]))

                    # mix each agent's thought with its ring neighbour's for self.share_iter rounds
                    for i in range(self.share_iter):
                        for agent_i in range(self.model.n_agents):
                            thoughts[agent_i] = (thoughts[agent_i] + thoughts[
                                (agent_i + 1) % len(thoughts)]) / 2
                    thoughts = torch.stack(thoughts)

                    action_n = []
                    for agent_i in range(self.model.n_agents):
                        # assuming every other agent is a neighbour as of now
                        _neighbours = list(range(self.model.n_agents))
                        _neighbours.remove(agent_i)

                        logits = self.model.agent(agent_i)(thoughts[agent_i])
                        prob = F.softmax(logits, dim=1)
                        action = prob.argmax(1).item()
                        # action = prob.multinomial(num_samples=1).detach().item()

                        if log and step == 0 and ep == 0:
                            log_prob = F.log_softmax(logits, dim=1)
                            entropy = -(log_prob * prob).sum(1)
                            self.writer.add_scalar(
                                'agent_{}/entropy'.format(agent_i), entropy,
                                self._step_iter)

                        action_n.append(action)

                    next_obs_n, reward_n, done_n, info = env.step(action_n)
                    terminal = all(done_n) or step >= self.episode_max_steps

                    obs_n = next_obs_n
                    step += 1
                    for i, r_n in enumerate(reward_n):
                        ep_reward[i] += r_n

                total_test_steps += step
                test_rewards.append(ep_reward)

            test_rewards = np.array(test_rewards).mean(axis=0)

            # log - test
            if log:
                for i, r_n in enumerate(test_rewards):
                    self.writer.add_scalar('agent_{}/eval_reward'.format(i),
                                           r_n, self._step_iter)
                self.writer.add_scalar('_overall/eval_reward',
                                       sum(test_rewards), self._step_iter)
                self.writer.add_scalar('_overall/test_ep_steps',
                                       total_test_steps / episodes,
                                       self._step_iter)

        if record:
            env.close()
        return test_rewards
Example #3
    parser = argparse.ArgumentParser(
        description='Interactive Agent for ma-gym')
    parser.add_argument('--env',
                        default='Checkers-v0',
                        help='Name of the environment (default: %(default)s)')
    parser.add_argument('--episodes',
                        type=int,
                        default=1,
                        help='episodes (default: %(default)s)')
    args = parser.parse_args()

    print(
        'Enter the actions of all agents together and press enter (e.g. \'11<enter>\' means action 1'
        ' for agent 1 and action 1 for agent 2)')

    # the 'ma_gym:' prefix tells gym to import the ma_gym package before resolving the environment id
    env = gym.make('ma_gym:{}'.format(args.env))
    env = Monitor(env, directory='recordings', force=True)
    for ep_i in range(args.episodes):
        done_n = [False for _ in range(env.n_agents)]
        ep_reward = 0

        obs_n = env.reset()
        env.render()
        while not all(done_n):
            action_n = [int(_) for _ in input('Action:')]
            obs_n, reward_n, done_n, _ = env.step(action_n)
            ep_reward += sum(reward_n)
            env.render()

        print('Episode #{} Reward: {}'.format(ep_i, ep_reward))
    env.close()
Example #4
            state = concat_obs(obs_n)
            round = 0
            cur_round = 0

            is_l_hit = False
            is_r_hit = False

            while not all(done_n):
                l_action = LEFT_agent.get_action(state_number,
                                                 state,
                                                 evaluation=False)
                r_action = RIGHT_agent.get_action(state_number,
                                                  state,
                                                  evaluation=False)
                end_round = False
                next_state, reward_n, done_n, info = env.step(
                    [l_action, r_action])
                next_state = concat_obs(next_state)
                cur_round = info['rounds']

                # a new round started: reset the per-round hit flags
                if cur_round != round:
                    is_l_hit = False
                    is_r_hit = False
                    round = cur_round
                    end_round = True

                # unpack paddle positions, ball position and the ball-direction entries from the concatenated observation
                paddle_l = np.array([state[0], state[1]])
                paddle_r = np.array([state[2], state[3]])
                ball = np.array([state[4], state[5]])
                dir = np.array(state[6:12])
                next_dir = np.array(next_state[6:12])
Example #5
                break
        print('Episode: {:07d} - Cumulative reward this episode: {}'.format(
            e, cumulative_reward))

    input('End of training. \n\nPress `ENTER` to start testing.')

    env = Monitor(env,
                  directory="recordings",
                  video_callable=lambda episode_id: True,
                  force=True)
    obs = env.reset()
    # while True:
    for _ in range(num_steps):
        env.render()
        state_0, state_1 = get_obs_tuples(obs)
        a_0_y, b_0_y, b_0_x, d_0, e_0_y = state_0[:5]
        a_1_y, b_1_y, b_1_x, d_1, e_1_y = state_1[:5]
        action_0 = agent_0.get_action_greedy(a_0_y, b_0_y, b_0_x, d_0, e_0_y)
        action_1 = agent_1.get_action_greedy(a_1_y, b_1_y, b_1_x, d_1, e_1_y)
        # override agent 1's greedy action with a randomly sampled one for this test run
        action_1 = env.action_space.sample()[1]
        action = [action_0, action_1]
        obs, reward, done, info = env.step(action)
        if all(done):
            break

    env.close()
Example #6
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Random Agent for ma-gym')
    parser.add_argument('--env',
                        default='Checkers-v0',
                        help='Name of the environment (default: %(default)s)')
    parser.add_argument('--episodes',
                        type=int,
                        default=1,
                        help='episodes (default: %(default)s)')
    args = parser.parse_args()

    env = gym.make(args.env)
    env = Monitor(env, directory='recordings/' + args.env, force=True)
    for ep_i in range(args.episodes):
        done_n = [False for _ in range(env.n_agents)]
        ep_reward = 0

        env.seed(ep_i)
        obs_n = env.reset()
        env.render()

        while not all(done_n):
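            # sample a random action for every agent from the joint action space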
            action_n = env.action_space.sample()
            obs_n, reward_n, done_n, info = env.step(action_n)
            ep_reward += sum(reward_n)
            env.render()

        print('Episode #{} Reward: {}'.format(ep_i, ep_reward))
    env.close()
Example #7
class Tester():
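    """Evaluate one or more saved models on a ma-gym environment and
    report per-agent move and win counts."""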
    def __init__(self, models, env_name="PongDuel-v0", render=True,
                 video=True, step_number=1000, log_after_steps=200,
                 log_on_win=True):
        self._models = models  # list of model ids to test; "all" expands to every registered model
        if "all" in self._models:
            self._models = list(registered_models["all"])
        self._render = render
        self._video = video
        self._step_number = step_number
        self._log_after_steps = log_after_steps
        self._log_on_win = log_on_win
        self._env_name = env_name
        self._env = None

    def log_score(self, step, score, msg=""):
        print("Step: {0:05d} Score: {1}".format(step, score), end="")
        if msg != "":
            print(" [{}]".format(msg))
        else:
            print("")

    def run_tests(self):
        print("Running tests for model IDs: {}".format(self._models))
        print("-" * 10)
        models_score_summary = {}

        for model_id in self._models:
            print("Selected model_id: {}".format(model_id))
            model = AutoLoadModel(model_id)

            score = {"agent_0": {"moves": 0,
                                 "wins": 0},
                     "agent_1": {"moves": 0,
                                 "wins": 0}}
            self._env = gym.make(self._env_name)
            if self._video:
                if isinstance(model_id, list):
                    model_id = "{}_VS_{}".format(model_id[0], model_id[1])
                output_directory = "recordings/{}".format(model_id)
                self._env = Monitor(self._env, directory=output_directory,
                                    video_callable=lambda episode_id: True,
                                    force=True)
            obs_n = self._env.reset()

            for step in range(self._step_number):

                # render env
                if self._render:
                    self._env.render()

                # select actions
                actions, actions_as_list = model.get_agents_actions(obs_n)

                # update moves counter
                for an in ["agent_0", "agent_1"]:
                    if actions[an] in [1, 2]:
                        score[an]["moves"] += 1

                # execute actions
                obs_n, reward_n, done_n, info = self._env.step(actions_as_list)

                # update score
                if any(reward_n):
                    score["agent_0"]["wins"] += reward_n[0]
                    score["agent_1"]["wins"] += reward_n[1]
                    if self._log_on_win:
                        self.log_score(step, score, "win")
                models_score_summary[model_id] = score
                if step % self._log_after_steps == 0:
                    self.log_score(step, score)

                if all(done_n):
                    break

            self.log_score(step, score, "end")
            print("-" * 10)
            self._env.close()

        # Score summary
        print("Summary:")
        for k, v in models_score_summary.items():
            n_moves = 0
            n_wins = 0
            print("Model{}:".format(k))
            for a, b in v.items():
                print(a, b)
                n_moves += b["moves"]
                n_wins += b["wins"]
            print("Average move count: {}".format(n_moves / 2))
            print("Total move count: {}".format(n_moves))
            print("Total win count: {}".format(n_wins))
            print("")
Example #8
def main():
    env = gym.make('PongDuel-v0')
    env = Monitor(env, directory='testings/PongDuel-v0', force=True)
    action_dim = env.action_space[0].n
    state_dim = env.observation_space[0].shape[0] + 2

    MAIN_DQN = create_model(state_dim, action_dim, is_dueling=True)
    TARGET_DQN = create_model(state_dim, action_dim, is_dueling=True)

    replay_buffer = ReplayBuffer(size=MEM_SIZE, input_shape=INPUT_SHAPE, use_per=USE_PER)
    agent = Agent(MAIN_DQN, TARGET_DQN, replay_buffer, action_dim, input_shape=INPUT_SHAPE, batch_size=BATCH_SIZE,
                  use_per=USE_PER)

    if LOAD_FROM is None:
        state_number = 0
        rewards = []
        loss_list = []
    else:
        print('Loading from', LOAD_FROM)
        meta = agent.load(LOAD_FROM, LOAD_REPLAY_BUFFER)

        state_number = meta['state_number']
        rewards = meta['rewards']
        loss_list = meta['loss_list']

    try:
        last_50_l = deque(maxlen=50)
        last_50_r = deque(maxlen=50)
        recent_10_game = deque(maxlen=10)

        start_time = datetime.now()
        print(start_time)
        for ep in range(MAX_EPISODE_LENGTH):
            done_n = [False for _ in range(env.n_agents)]
            l_cnt = 0
            r_cnt = 0
            state = env.reset()
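            # concat_obs (defined elsewhere) merges the per-agent observations into a single state vector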
            state = concat_obs(state)
            while not all(done_n):
                trained_action = agent.get_action(state_number, state, evaluation=False)

                next_state, reward_n, done_n, _ = env.step([trained_action, random_action()])
                next_state = concat_obs(next_state)

                paddle_l = next_state[0]
                paddle_r = next_state[2]
                ball = state[4]

                delta_l = np.subtract(paddle_l, ball)
                delta_r = np.subtract(paddle_r, ball)

                # if reward_n[1] == 1:
                #     if delta_l > 0:
                #         if trained_action == 1:
                #             reward = 1
                #         else:
                #             reward = -1
                #     elif delta_l == 0:
                #         if trained_action == 0:
                #             reward = 1
                #         else:
                #             reward = -1
                #     else:
                #         if trained_action == 2:
                #             reward = 1
                #         else:
                #             reward = -1
                # elif reward_n[0] == 1:
                #     reward = abs(delta_r)*10
                # else:
                #     reward = reward_n[0]

                agent.add_experience(action=trained_action, state=state, reward=reward_n[0], clip_reward=CLIP_REWARD,
                                     done=done_n[0])

                state_number += 1
                l_reward = reward_n[0]
                r_reward = reward_n[1]
                l_cnt += l_reward
                r_cnt += r_reward

                # once the replay buffer is warm, train every UPDATE_FREQ steps and periodically sync the target network
                if state_number % UPDATE_FREQ == 0 and agent.replay_buffer.count > MIN_REPLAY_BUFFER_SIZE:
                    loss, _ = agent.learn(BATCH_SIZE, gamma=DISCOUNT_FACTOR, state_number=state_number,
                                          priority_scale=PRIORITY_SCALE)
                    loss_list.append(loss)

                if state_number % UPDATE_FREQ == 0 and state_number > MIN_REPLAY_BUFFER_SIZE:
                    agent.update_target_network()

                state = next_state

            if l_cnt > r_cnt:
                is_win = 'Win'
                recent_10_game.append(1)
            elif l_cnt == r_cnt:
                is_win = 'draw'
            else:
                is_win = 'Lose'
                recent_10_game.append(0)

            last_50_l.append(l_cnt)
            last_50_r.append(r_cnt)
            avg_l, avg_r = np.mean(last_50_l), np.mean(last_50_r)
            recent_10_win_rate = np.mean(recent_10_game)
            cur_time = datetime.now()
            print("{}||Episode #{} {} left: {} right: {} / avg score {}:{} / recent 10 game win rate: {}".format(cur_time, ep, is_win, l_cnt, r_cnt, avg_l, avg_r, recent_10_win_rate))

        SAVE_PATH = 'PongDuel-saves'
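        # SAVE_PATH is preset here, so the interactive prompt below is effectively skipped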
        print('\nTraining end.')
        if SAVE_PATH is None:
            try:
                SAVE_PATH = input(
                    'Would you like to save the trained model? If so, type in a save path, otherwise, interrupt with '
                    'ctrl+c. ')
            except KeyboardInterrupt:
                print('\nExiting...')
        if SAVE_PATH is not None:
            print('Saving...')
            agent.save(f'{SAVE_PATH}/save-{str(state_number).zfill(8)}', state_number=state_number, rewards=rewards,
                       loss_list=loss_list)
            print('Saved.')

    except KeyboardInterrupt:
        SAVE_PATH = 'PongDuel-saves'
        print('\nTraining exited early.')
        if SAVE_PATH is None:
            try:
                SAVE_PATH = input(
                    'Would you like to save the trained model? If so, type in a save path, otherwise, interrupt with '
                    'ctrl+c. ')
            except KeyboardInterrupt:
                print('\nExiting...')
        if SAVE_PATH is not None:
            print('Saving...')
            agent.save(f'{SAVE_PATH}/save-{str(state_number).zfill(8)}', state_number=state_number, rewards=rewards,
                       loss_list=loss_list)
            print('Saved.')