예제 #1
0
class TD3Agent:

    MAX_EXPERIENCES = 30000

    MIN_EXPERIENCES = 300

    ENV_ID = "Pendulum-v0"

    ACTION_SPACE = 1

    MAX_ACTION = 2

    OBSERVATION_SPACE = 3

    CRITIC_UPDATE_PERIOD = 4

    POLICY_UPDATE_PERIOD = 8

    TAU = 0.02

    GAMMA = 0.99

    BATCH_SIZE = 64

    NOISE_STDDEV = 0.2

    def __init__(self):

        self.env = gym.make(self.ENV_ID)

        self.env.max_episode_steps = 3000

        self.actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                  max_action=self.MAX_ACTION)

        self.target_actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                         max_action=self.MAX_ACTION)

        self.critic = CriticNetwork()

        self.target_critic = CriticNetwork()

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0

        self.hiscore = None

        self._build_networks()

    def _build_networks(self):
        """パラメータの初期化
        """

        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor.call(dummy_state)
        self.target_actor.call(dummy_state)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.set_weights(self.critic.get_weights())

    def play(self, n_episodes):

        total_rewards = []

        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):

            total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)

            recent_scores.append(total_reward)

            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.NOISE_STDDEV}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self):

        total_reward = 0

        steps = 0

        done = False

        state = self.env.reset()

        while not done:

            action = self.actor.sample_action(state, noise=self.NOISE_STDDEV)

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            self.buffer.add_experience(exp)

            state = next_state

            total_reward += reward

            steps += 1

            self.global_steps += 1

            #: Delayed Policy update
            if self.global_steps % self.CRITIC_UPDATE_PERIOD == 0:
                if self.global_steps % self.POLICY_UPDATE_PERIOD == 0:
                    self.update_network(self.BATCH_SIZE, update_policy=True)
                    self.update_target_network()
                else:
                    self.update_network(self.BATCH_SIZE)

        return total_reward, steps

    def update_network(self, batch_size, update_policy=False):

        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards, next_states,
         dones) = self.buffer.get_minibatch(batch_size)

        clipped_noise = np.clip(np.random.normal(0, 0.2, self.ACTION_SPACE),
                                -0.5, 0.5)

        next_actions = self.target_actor(
            next_states) + clipped_noise * self.MAX_ACTION

        q1, q2 = self.target_critic(next_states, next_actions)

        next_qvalues = [
            min(q1, q2) for q1, q2 in zip(q1.numpy().flatten(),
                                          q2.numpy().flatten())
        ]

        #: Compute taeget values and update CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue in zip(rewards, dones, next_qvalues)
        ]).astype(np.float32)

        #: Update Critic
        with tf.GradientTape() as tape:
            q1, q2 = self.critic(states, actions)
            loss1 = tf.reduce_mean(tf.square(target_values - q1))
            loss2 = tf.reduce_mean(tf.square(target_values - q2))
            loss = loss1 + loss2

        variables = self.critic.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic.optimizer.apply_gradients(zip(gradients, variables))

        #: Delayed Update ActorNetwork
        if update_policy:

            with tf.GradientTape() as tape:
                q1, _ = self.critic(states, self.actor(states))
                J = -1 * tf.reduce_mean(q1)

            variables = self.actor.trainable_variables
            gradients = tape.gradient(J, variables)
            self.actor.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):

        # soft-target update Actor
        target_actor_weights = self.target_actor.get_weights()
        actor_weights = self.actor.get_weights()

        assert len(target_actor_weights) == len(actor_weights)

        self.target_actor.set_weights((1 - self.TAU) *
                                      np.array(target_actor_weights) +
                                      (self.TAU) * np.array(actor_weights))

        # soft-target update Critic
        target_critic_weights = self.target_critic.get_weights()
        critic_weights = self.critic.get_weights()

        assert len(target_critic_weights) == len(critic_weights)

        self.target_critic.set_weights((1 - self.TAU) *
                                       np.array(target_critic_weights) +
                                       (self.TAU) * np.array(critic_weights))

    def save_model(self):

        self.actor.save_weights("checkpoints/actor")

        self.critic.save_weights("checkpoints/critic")

    def load_model(self):

        self.actor.load_weights("checkpoints/actor")

        self.target_actor.load_weights("checkpoints/actor")

        self.critic.load_weights("checkpoints/critic")

        self.target_critic.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0

            steps = 0

            done = False

            state = env.reset()

            while not done:

                action = self.actor.sample_action(state, noise=False)

                next_state, reward, done, _ = env.step(action)

                state = next_state

                total_reward += reward

                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps:", steps)
            print()
class DDPGAgent:

    MAX_EXPERIENCES = 30000

    MIN_EXPERIENCES = 300

    ENV_ID = "Pendulum-v0"

    ACTION_SPACE = 1

    OBSERVATION_SPACE = 3

    UPDATE_PERIOD = 4

    START_EPISODES = 20

    TAU = 0.02

    GAMMA = 0.99

    BATCH_SIZE = 32

    def __init__(self):

        self.env = gym.make(self.ENV_ID)

        self.env.max_episode_steps = 1000

        self.actor_network = ActorNetwork(action_space=self.ACTION_SPACE)

        self.target_actor_network = ActorNetwork(
            action_space=self.ACTION_SPACE)

        self.critic_network = CriticNetwork()

        self.target_critic_network = CriticNetwork()

        self.stdev = 0.2

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0

        self.hiscore = None

        self._build_networks()

    def _build_networks(self):
        """パラメータの初期化
        """

        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor_network.call(dummy_state)
        self.target_actor_network.call(dummy_state)
        self.target_actor_network.set_weights(self.actor_network.get_weights())

        self.critic_network.call(dummy_state, dummy_action, training=False)
        self.target_critic_network.call(dummy_state,
                                        dummy_action,
                                        training=False)
        self.target_critic_network.set_weights(
            self.critic_network.get_weights())

    def play(self, n_episodes):

        total_rewards = []

        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):

            if n <= self.START_EPISODES:
                total_reward, localsteps = self.play_episode(random=True)
            else:
                total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)

            recent_scores.append(total_reward)

            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.stdev}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self, random=False):

        total_reward = 0

        steps = 0

        done = False

        state = self.env.reset()

        while not done:

            if random:
                action = np.random.uniform(-2, 2, size=self.ACTION_SPACE)
            else:
                action = self.actor_network.sample_action(state,
                                                          noise=self.stdev)

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)

            self.buffer.add_experience(exp)

            state = next_state

            total_reward += reward

            steps += 1

            self.global_steps += 1

            if self.global_steps % self.UPDATE_PERIOD == 0:
                self.update_network(self.BATCH_SIZE)
                self.update_target_network()

        return total_reward, steps

    def update_network(self, batch_size):

        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards, next_states,
         dones) = self.buffer.get_minibatch(batch_size)

        next_actions = self.target_actor_network(next_states)

        next_qvalues = self.target_critic_network(
            next_states, next_actions).numpy().flatten()

        #: Compute taeget values and update CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue in zip(rewards, dones, next_qvalues)
        ]).astype(np.float32)

        with tf.GradientTape() as tape:
            qvalues = self.critic_network(states, actions)
            loss = tf.reduce_mean(tf.square(target_values - qvalues))

        variables = self.critic_network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic_network.optimizer.apply_gradients(zip(
            gradients, variables))

        #: Update ActorNetwork
        with tf.GradientTape() as tape:
            J = -1 * tf.reduce_mean(
                self.critic_network(states, self.actor_network(states)))

        variables = self.actor_network.trainable_variables
        gradients = tape.gradient(J, variables)
        self.actor_network.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):

        # soft-target update Actor
        target_actor_weights = self.target_actor_network.get_weights()
        actor_weights = self.actor_network.get_weights()

        assert len(target_actor_weights) == len(actor_weights)

        self.target_actor_network.set_weights(
            (1 - self.TAU) * np.array(target_actor_weights) +
            (self.TAU) * np.array(actor_weights))

        # soft-target update Critic
        target_critic_weights = self.target_critic_network.get_weights()
        critic_weights = self.critic_network.get_weights()

        assert len(target_critic_weights) == len(critic_weights)

        self.target_critic_network.set_weights(
            (1 - self.TAU) * np.array(target_critic_weights) +
            (self.TAU) * np.array(critic_weights))

    def save_model(self):

        self.actor_network.save_weights("checkpoints/actor")

        self.critic_network.save_weights("checkpoints/critic")

    def load_model(self):

        self.actor_network.load_weights("checkpoints/actor")

        self.target_actor_network.load_weights("checkpoints/actor")

        self.critic_network.load_weights("checkpoints/critic")

        self.target_critic_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir,
                                   force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0

            steps = 0

            done = False

            state = env.reset()

            while not done:

                action = self.actor_network.sample_action(state, noise=False)

                next_state, reward, done, _ = env.step(action)

                state = next_state

                total_reward += reward

                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print(f"Steps:", steps)
            print()