    def arrange_match(self):
        """Play one two-agent episode in which the learning agent controls both players."""
        self.episode_count += 1
        brain_name = self.env.brain_names[0]
        env_info = self.env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        episode_reward1 = 0
        episode_reward2 = 0

        for step in range(self.max_steps):
            # The same agent selects an action for each player from that player's observation.
            actions = [self.agent.get_action(state[0]), self.agent.get_action(state[1])]
            env_info = self.env.step(actions)[brain_name]
            reward = env_info.rewards
            next_state = env_info.vector_observations
            done = env_info.local_done
            # Store both players' transitions in the shared agent's memory.
            self.agent.learn_experience(buffer.Experience(state[0], actions[0], reward[0], next_state[0], done[0]))
            self.agent.learn_experience(buffer.Experience(state[1], actions[1], reward[1], next_state[1], done[1]))

            episode_reward1 += reward[0]
            episode_reward2 += reward[1]

            if done[0] or done[1] or step == self.max_steps-1:
                break

            state = next_state

        return max(episode_reward1, episode_reward2), step
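These snippets assume a `buffer` module whose `Experience` type holds one transition. A minimal sketch of that container is shown below; the field names are assumptions inferred from the constructor calls above, and the real module may differ:

# buffer.py -- minimal sketch; field names are assumptions inferred from usage above.
from collections import namedtuple

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])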
    def learn_episode(self, max_steps):
        """Run one single-agent training episode and return its cumulative reward."""
        brain_name = self.env.brain_names[0]
        env_info = self.env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]

        episode_reward = 0

        for step in range(max_steps):
            action = self.get_action(state)
            env_info = self.env.step(action)[brain_name]
            reward = env_info.rewards[0]
            next_state = env_info.vector_observations[0]
            done = env_info.local_done[0]
            self.replay_buffer.add(buffer.Experience(state, action, reward, next_state, done))
            episode_reward += reward

            # Perform a learning update once the buffer can supply a full batch.
            if self.replay_buffer.ready_to_sample():
                self.update()

            if done or step == max_steps-1:
                break

            state = next_state

        return episode_reward
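`learn_episode` also relies on a `replay_buffer` exposing `add()` and `ready_to_sample()`. A plausible minimal implementation is sketched below; `capacity`, `batch_size`, and the `sample()` helper are illustrative assumptions, not the project's actual code:

import random
from collections import deque

class ReplayBuffer:
    """Minimal sketch of the buffer interface used by learn_episode."""

    def __init__(self, capacity=100000, batch_size=64):
        self.memory = deque(maxlen=capacity)
        self.batch_size = batch_size

    def add(self, experience):
        # Store one buffer.Experience transition.
        self.memory.append(experience)

    def ready_to_sample(self):
        # True once at least one full batch can be drawn.
        return len(self.memory) >= self.batch_size

    def sample(self):
        # Uniformly sample a training batch.
        return random.sample(self.memory, self.batch_size)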
    def play_episode(self):
        """Run one two-agent episode, storing a single joint transition per step."""
        brain_name = self.env.brain_names[0]
        env_info = self.env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        episode_reward1 = 0
        episode_reward2 = 0

        for step in range(self.max_steps):
            # The agent produces actions for both players from the stacked observations.
            actions = self.agent.get_actions(state)
            env_info = self.env.step(actions)[brain_name]
            reward = env_info.rewards
            next_state = env_info.vector_observations
            done = env_info.local_done
            # Unlike arrange_match, the whole joint transition (both players) is stored at once.
            self.agent.learn_experience(buffer.Experience(state, actions, reward, next_state, done))

            episode_reward1 += reward[0]
            episode_reward2 += reward[1]

            if done[0] or done[1] or step == self.max_steps-1:
                break

            state = next_state

        return max(episode_reward1, episode_reward2), step
    def arrange_match(self):
        """Play one episode against either the live agent (self-play) or a frozen past copy."""
        self.episode_count += 1
        # Periodically snapshot the learning agent into the pool of frozen opponents.
        if self.steps_till_freeze < 0:
            self.steps_till_freeze = self.freeze_steps
            self.frozen_agents[self.next_freeze_agent].copy_and_freeze(self.agent)
            self.next_freeze_agent = (self.next_freeze_agent + 1) % len(self.frozen_agents)

        brain_name = self.env.brain_names[0]
        env_info = self.env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        episode_reward1 = 0
        episode_reward2 = 0

        # Choose the opponent: the live agent itself, or a random frozen snapshot.
        if random.random() < self.self_play_probability:
            opponent_agent = self.agent
        else:
            opponent_agent = random.choice(self.frozen_agents)

        for step in range(self.max_steps):
            # The learning agent controls player 0; the chosen opponent controls player 1.
            actions = [self.agent.get_action(state[0]), opponent_agent.get_action(state[1])]
            env_info = self.env.step(actions)[brain_name]
            reward = env_info.rewards
            next_state = env_info.vector_observations
            done = env_info.local_done
            # Both players' transitions still feed the learning agent's memory.
            self.agent.learn_experience(buffer.Experience(state[0], actions[0], reward[0], next_state[0], done[0]))
            self.agent.learn_experience(buffer.Experience(state[1], actions[1], reward[1], next_state[1], done[1]))

            episode_reward1 += reward[0]
            episode_reward2 += reward[1]

            if done[0] or done[1] or step == self.max_steps-1:
                break

            self.steps_till_freeze -= 1
            state = next_state

        return max(episode_reward1, episode_reward2), step
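The freezing and opponent-selection schedule in this last `arrange_match` can be exercised in isolation. The standalone toy below is only an illustration; `DummyAgent`, the episode count, and the freeze interval are stand-ins, not part of the project. It shows how the frozen pool rotates and how an opponent is drawn each episode:

import random

class DummyAgent:
    """Stand-in for the real agent; `version` plays the role of network weights."""

    def __init__(self):
        self.version = 0

    def copy_and_freeze(self, agent):
        # Snapshot the live agent's current "weights".
        self.version = agent.version

learning_agent = DummyAgent()
frozen_agents = [DummyAgent() for _ in range(3)]
next_freeze_agent = 0
self_play_probability = 0.5

for episode in range(12):
    learning_agent.version += 1      # pretend the agent improved this episode
    if episode % 4 == 3:             # periodic freeze, like steps_till_freeze reaching zero
        frozen_agents[next_freeze_agent].copy_and_freeze(learning_agent)
        next_freeze_agent = (next_freeze_agent + 1) % len(frozen_agents)
    opponent = learning_agent if random.random() < self_play_probability else random.choice(frozen_agents)
    print(f"episode {episode}: opponent version {opponent.version}")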