class Tester:

    def __init__(self, env_name):

        self.env_name = env_name
        self.action_space = gym.make(self.env_name).action_space.n
        self.q_network = QNetwork(self.action_space)
        self.define_network()

    def define_network(self):
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))

    def test_play(self, current_weights, epsilon):

        self.q_network.set_weights(current_weights)

        env = gym.make(self.env_name)
        state = env.reset()
        episode_rewards = 0
        done = False
        while not done:
            action = self.q_network.sample_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_rewards += reward
            state = next_state

        return episode_rewards
class Learner:

    def __init__(self, gamma, env_name):
        self.env_name = env_name
        self.action_space = gym.make(self.env_name).action_space.n
        self.q_network = QNetwork(self.action_space)
        self.target_q_network = QNetwork(self.action_space)
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    def define_network(self):
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))
        self.target_q_network(np.atleast_2d(state))
        self.target_q_network.set_weights(self.q_network.get_weights())
        current_weights = self.q_network.get_weights()
        return current_weights

    def update_network(self, minibatches):

        indices_all = []
        td_errors_all = []

        for (indices, weights, transitions) in minibatches:

            states, actions, rewards, next_states, dones = zip(*transitions)

            states = np.vstack(states)
            actions = np.array(actions)
            rewards = np.vstack(rewards)
            next_states = np.vstack(next_states)
            dones = np.vstack(dones)

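            # TD target: bootstrap from the greedy action of the online network
            # (the one-hot sum below just picks out max_a Q(s', a)). The squared
            # TD errors computed further down serve both as the per-sample loss
            # terms, scaled by the importance-sampling weights supplied with
            # each minibatch, and as the updated priorities returned to the caller.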
            next_qvalues = self.q_network(next_states)
            next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
            next_actions_onehot = tf.one_hot(next_actions, self.action_space)
            next_maxQ = tf.reduce_sum(
                next_qvalues * next_actions_onehot, axis=1, keepdims=True)
            TQ = rewards + self.gamma * (1 - dones) * next_maxQ

            with tf.GradientTape() as tape:
                qvalues = self.q_network(states)
                actions_onehot = tf.one_hot(actions, self.action_space)
                Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
                td_errors = tf.square(TQ - Q)
                loss = tf.reduce_mean(weights * td_errors)

            grads = tape.gradient(loss, self.q_network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 40.0)
            self.optimizer.apply_gradients(
                zip(grads, self.q_network.trainable_variables))

            indices_all += indices
            td_errors_all += td_errors.numpy().flatten().tolist()

        current_weights = self.q_network.get_weights()
        return current_weights, indices_all, td_errors_all
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_shape, action_size, buffer_size, batch_size,
                 gamma, tau, learning_rate, update_every, seed):
        """Initialize an Agent object.

        Params
        ======
            state_shape (tuple): shape of each state
            action_size (int): number of possible actions
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): interpolation factor for soft update of target parameters
            learning_rate (float): learning rate
            update_every (int): how many steps between network updates
            seed (int): random seed
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.seed = seed
        random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(action_size)
        self.qnetwork_target = QNetwork(action_size)

        #self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.00025,
                                                     momentum=0.95)
        self.loss_fn = tf.keras.losses.MeanSquaredError()

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Adds new experience to the replay buffer and learns from a subset of memories.

        Params
        ======
        state (array_like): current state
        action (int): action taken
        reward (float): reward received
        next_state (array_like): next state
        done (bool): True if the episode is finished
        """

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()

                self.learn(experiences)

    def act(self, state, epsilon=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            epsilon (float): epsilon, for epsilon-greedy action selection
        """
        #state = tf.expand_dims(state, axis=0)
        action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (tuple): batch of (s, a, r, s', done) arrays
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states)
        Q_targets_next = tf.math.reduce_max(Q_targets_next, axis=1)
        Q_targets_next = tf.expand_dims(Q_targets_next, axis=1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        with tf.GradientTape() as tape:
            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states)
            Q_expected = tf.gather(Q_expected,
                                   indices=actions,
                                   axis=1,
                                   batch_dims=1)
            loss = self.loss_fn(y_true=Q_targets, y_pred=Q_expected)

        gradients = tape.gradient(loss, self.qnetwork_local.trainable_weights)
        self.optimizer.apply_gradients(
            zip(gradients, self.qnetwork_local.trainable_weights))

        # ------------------- update target network ------------------- #
        #self.soft_update()

    def hard_update(self):
        self.qnetwork_target.set_weights(self.qnetwork_local.get_weights())

    def soft_update(self):
        """Soft update of the target network parameters.

        θ_target = τ * θ_local + (1 - τ) * θ_target
        """
        for target_var, local_var in zip(
                self.qnetwork_target.trainable_weights,
                self.qnetwork_local.trainable_weights):
            target_var.assign(self.tau * local_var +
                              (1.0 - self.tau) * target_var)
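
# A minimal, hypothetical training-loop sketch for the Agent class defined
# above (a second, differently parameterised Agent example follows later in
# this file, so the class is captured via a default argument). It assumes a
# Gym environment with a discrete action space such as "CartPole-v1" and the
# QNetwork / ReplayBuffer helpers defined elsewhere; hyperparameters are
# illustrative only.
def _example_agent_training_loop(n_episodes=500, agent_cls=Agent):
    env = gym.make("CartPole-v1")
    agent = agent_cls(state_shape=env.observation_space.shape,
                      action_size=env.action_space.n,
                      buffer_size=100_000, batch_size=64, gamma=0.99,
                      tau=1e-3, learning_rate=5e-4, update_every=4, seed=0)
    epsilon = 1.0
    for episode in range(1, n_episodes + 1):
        state, done, score = env.reset(), False, 0.0
        while not done:
            # act() expects a batched state (see the commented-out expand_dims above)
            action = agent.act(np.atleast_2d(state), epsilon)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        epsilon = max(0.01, epsilon * 0.995)  # simple exponential epsilon decay
        if episode % 10 == 0:
            agent.hard_update()  # periodically sync the target network
    return agent
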
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 update_every):
        """Initialize an Agent object.

        Args:
            state_size: Integer. Dimension of each state
            action_size: Integer. Dimension of each action
            buffer_size: Integer. Replay buffer size
            batch_size: Integer. Mini-batch size
            gamma: Float. Discount factor
            tau: Float. For soft update of target parameters
            lr: Float. Learning rate
            update_every: Integer. How often to update the network
        """
        # Environment parameters
        self.state_size = state_size
        self.action_size = action_size

        # Q-Learning
        self.gamma = gamma

        # Q-Network
        self.model_local = QNetwork(state_size, action_size)
        self.model_target = QNetwork(state_size, action_size)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.loss_fn = tf.keras.losses.MeanSquaredError(name="mse")
        self.tau = tau
        self.update_every = update_every
        self.batch_size = batch_size

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size)

        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def __str__(self):
        return 'RL_Agent_Class'

    def __repr__(self):
        return 'RL_Agent_Class'

    def step(self, state, action, reward, next_state, done):
        """Save state on buffer and trigger learn according to update_every

        Args:
            state: The previous state of the environment
            action: Integer. Previous action selected by the agent
            reward: Float. Reward value
            next_state: The current state of the environment
            done: Boolean. Whether the episode is complete
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:

            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
            state: An array-like object or list containing the current state
            eps: Float. Random value for epsilon-greedy action selection

        Returns:
            An action selected by the network or by the epsilon-greedy method
        """
        # Reshape state
        state = np.expand_dims(state, 0)

        # Predict action
        action_values = self.model_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Args:
            experiences: Tuple. Content of tuple (s, a, r, s', done)
        """
        states, actions, rewards, next_states, dones = experiences

        # Create mask to actions
        mask = tf.one_hot(actions.reshape(-1), self.action_size)

        with tf.GradientTape() as tape:

            # Get expected Q values from local model
            q_expected = tf.reduce_sum(self.model_local(states) * mask,
                                       axis=1, keepdims=True)

            # Get max predicted Q values (for next states) from target model
            q_targets_next = tf.reduce_max(self.model_target(next_states),
                                           axis=1, keepdims=True)

            # Compute Q targets for current states
            # (rewards and dones are expected as column vectors of shape (batch, 1))
            q_targets = rewards + self.gamma * q_targets_next * (1.0 - dones)

            # Compute loss
            loss = self.loss_fn(q_targets, q_expected)

        # Minimize the loss
        gradients = tape.gradient(loss, self.model_local.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model_local.trainable_variables))

        # Update target network
        self.soft_update()

    def soft_update(self):
        """Soft update model parameters.

        The target model is updated using:
            θ_target = τ * θ_local + (1 - τ) * θ_target
        """

        # Interpolate between local and target weights
        new_weights = []
        for local_w, target_w in zip(self.model_local.get_weights(),
                                     self.model_target.get_weights()):
            new_weights.append(self.tau * local_w +
                               (1.0 - self.tau) * target_w)

        # Set new weights on the target network
        self.model_target.set_weights(new_weights)
class Actor:

    def __init__(self, pid, epsilon, gamma, env_name):

        self.pid = pid
        self.env_name = env_name
        self.env = gym.make(self.env_name)
        self.action_space = self.env.action_space.n

        self.q_network = QNetwork(self.action_space)
        self.epsilon = epsilon
        self.gamma = gamma
        self.buffer = []

        self.state = self.env.reset()
        self.define_network()

        self.episode_rewards = 0

    def define_network(self):
        tf.config.set_visible_devices([], 'GPU')
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))

    def rollout(self, current_weights):
        #: Synchronize weights with the global Q-network
        self.q_network.set_weights(current_weights)

        #: Roll out 100 steps
        for _ in range(100):
            state = self.state
            action = self.q_network.sample_action(state, self.epsilon)
            next_state, reward, done, _ = self.env.step(action)
            self.episode_rewards += reward
            transition = (state, action, reward, next_state, done)
            self.buffer.append(transition)

            if done:
                print(self.episode_rewards)
                self.state = self.env.reset()
                self.episode_rewards = 0
            else:
                self.state = next_state

        #: Compute initial priorities for the collected transitions
        states = np.vstack([transition[0] for transition in self.buffer])
        actions = np.array([transition[1] for transition in self.buffer])
        rewards = np.vstack([transition[2] for transition in self.buffer])
        next_states = np.vstack([transition[3] for transition in self.buffer])
        dones = np.vstack([transition[4] for transition in self.buffer])

        next_qvalues = self.q_network(next_states)
        next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        next_maxQ = tf.reduce_sum(
            next_qvalues * next_actions_onehot, axis=1, keepdims=True)

        TQ = rewards + self.gamma * (1 - dones) * next_maxQ

        qvalues = self.q_network(states)
        actions_onehot = tf.one_hot(actions, self.action_space)
        Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)

        td_errors = (TQ - Q).numpy().flatten()
        transitions = self.buffer
        self.buffer = []

        return td_errors, transitions, self.pid
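
# How the pieces above fit together (an Ape-X style sketch, not part of the
# original code): each Actor rolls out 100 steps with a fixed per-actor epsilon
# and returns (td_errors, transitions, pid) so initial priorities can be set
# without blocking the learner; a prioritized replay buffer (not shown in this
# section) stores the transitions, the Learner consumes prioritized minibatches
# and returns fresh weights plus updated priorities, and the Tester evaluates
# the current weights from time to time. If Ray is used for the parallelism,
# the actor side might look roughly like this (env name and hyperparameters
# are illustrative):
def _example_parallel_rollouts(env_name="CartPole-v1", num_actors=4, gamma=0.98):
    import ray  # assumed dependency for the distributed variant

    ray.init(ignore_reinit_error=True)

    learner = Learner(gamma=gamma, env_name=env_name)
    current_weights = ray.put(learner.define_network())

    RemoteActor = ray.remote(Actor)
    actors = [RemoteActor.remote(pid=i, epsilon=0.5 * (i + 1) / num_actors,
                                 gamma=gamma, env_name=env_name)
              for i in range(num_actors)]

    # Kick off one rollout per actor and collect results as they finish
    jobs = [actor.rollout.remote(current_weights) for actor in actors]
    for _ in range(10):
        finished, jobs = ray.wait(jobs, num_returns=1)
        td_errors, transitions, pid = ray.get(finished[0])
        # ...push (td_errors, transitions) into the prioritized replay buffer,
        # call learner.update_network() on sampled minibatches, update the
        # priorities, and restart the finished actor with the latest weights...
        jobs.append(actors[pid].rollout.remote(current_weights))
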
class DQNAgent:
    def __init__(self,
                 env_name="BreakoutDeterministic-v4",
                 gamma=0.99,
                 batch_size=32,
                 lr=0.00025,
                 update_period=4,
                 target_update_period=10000,
                 n_frames=4):

        self.env_name = env_name

        self.gamma = gamma

        self.batch_size = batch_size

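        # Linearly anneal epsilon from 1.0 down to 0.1 over the first 1M steps,
        # then keep it fixed at 0.1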
        self.epsilon_scheduler = (
            lambda steps: max(1.0 - 0.9 * steps / 1000000, 0.1))

        self.update_period = update_period

        self.target_update_period = target_update_period

        env = gym.make(self.env_name)

        self.action_space = env.action_space.n

        self.qnet = QNetwork(self.action_space)

        self.target_qnet = QNetwork(self.action_space)

        self.optimizer = Adam(learning_rate=lr, epsilon=0.01 / self.batch_size)

        self.n_frames = n_frames

        self.use_reward_clipping = True

        self.huber_loss = tf.keras.losses.Huber()

    def learn(self, n_episodes, buffer_size=1000000, logdir="log"):

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        steps = 0
        for episode in range(1, n_episodes + 1):
            env = gym.make(self.env_name)

            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5

            while not done:

                steps, episode_steps = steps + 1, episode_steps + 1

                epsilon = self.epsilon_scheduler(steps)

                state = np.stack(frames, axis=2)[np.newaxis, ...]

                action = self.qnet.sample_action(state, epsilon=epsilon)

                next_frame, reward, done, info = env.step(action)

                episode_rewards += reward

                frames.append(preprocess_frame(next_frame))

                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

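                # Store a lost life as a terminal transition (a common Atari
                # trick that sharpens credit assignment) even though the
                # actual episode keeps running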
                if info["ale.lives"] != lives:
                    lives = info["ale.lives"]
                    transition = (state, action, reward, next_state, True)
                else:
                    transition = (state, action, reward, next_state, done)

                self.replay_buffer.push(transition)

                if len(self.replay_buffer) > 50000:
                    if steps % self.update_period == 0:
                        loss = self.update_network()
                        with self.summary_writer.as_default():
                            tf.summary.scalar("loss", loss, step=steps)
                            tf.summary.scalar("epsilon", epsilon, step=steps)
                            tf.summary.scalar("buffer_size",
                                              len(self.replay_buffer),
                                              step=steps)
                            tf.summary.scalar("train_score",
                                              episode_rewards,
                                              step=steps)
                            tf.summary.scalar("train_steps",
                                              episode_steps,
                                              step=steps)

                    if steps % self.target_update_period == 0:
                        self.target_qnet.set_weights(self.qnet.get_weights())

                if done:
                    break

            print(
                f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}"
            )
            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=steps)
                    tf.summary.scalar("test_step", test_steps[0], step=steps)

            if episode % 1000 == 0:
                self.qnet.save_weights("checkpoints/qnet")

    def update_network(self):

        #: Create a minibatch from the replay buffer
        (states, actions, rewards, next_states,
         dones) = self.replay_buffer.get_minibatch(self.batch_size)

        if self.use_reward_clipping:
            rewards = np.clip(rewards, -1, 1)

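        # Bootstrap target from the target network: sample_actions() provides
        # the next actions and their Q-values, and the one-hot sum picks out
        # Q(s', a') for target_q = r + gamma * (1 - done) * Q(s', a')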
        next_actions, next_qvalues = self.target_qnet.sample_actions(
            next_states)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1,
                                         keepdims=True)

        target_q = rewards + self.gamma * (1 - dones) * max_next_qvalues

        with tf.GradientTape() as tape:

            qvalues = self.qnet(states)
            actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                        self.action_space)
            q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
            loss = self.huber_loss(target_q, q)

        grads = tape.gradient(loss, self.qnet.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.qnet.trainable_variables))

        return loss

    def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):

        if checkpoint_path:
            env = gym.make(self.env_name)
            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            state = np.stack(frames, axis=2)[np.newaxis, ...]
            self.qnet(state)
            self.qnet.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(gym.make(self.env_name),
                                       monitor_dir,
                                       force=True,
                                       video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):

            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=0.05)
                next_frame, reward, done, _ = env.step(action)
                frames.append(preprocess_frame(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: Workaround for runs that stall without starting the game (action: 0)
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps
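
# A minimal, hypothetical entry point for the DQNAgent above; the episode
# count, checkpoint path and monitor directory are illustrative only.
if __name__ == "__main__":
    agent = DQNAgent(env_name="BreakoutDeterministic-v4")
    agent.learn(n_episodes=5000, buffer_size=1000000, logdir="log")
    scores, steps = agent.test_play(n_testplay=3,
                                    checkpoint_path="checkpoints/qnet",
                                    monitor_dir="mp4")
    print("test scores:", scores)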