class ActorCritic:
    def __init__(self,
                 sess,
                 training_steps=5000000,
                 learning_rate=0.0001,
                 momentum=0.95,
                 memory_size=100000,
                 discount_rate=0.95,
                 eps_min=0.05):
        self.activation = tf.nn.relu
        self.optimizer = tf.train.MomentumOptimizer
        self.learning_rate = learning_rate
        self.momentum = momentum

        self._build_graph()

        self.memory_size = memory_size
        self.memory = ReplayMemory(self.memory_size)
        '''
        The discount rate controls how strongly future rewards are weighted when evaluating an action.
        A value of 0 means the agent only considers the immediate reward, while a value close to 1 makes it
        weigh rewards far in the future almost as heavily as immediate ones.
        '''
        self.discount_rate = discount_rate
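        # For example, with discount_rate = 0.95 a reward received k steps in the
        # future is weighted by 0.95 ** k: roughly 0.60 after 10 steps and 0.08
        # after 50 steps.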

        self.eps_min = eps_min
        self.eps_decay_steps = int(training_steps / 2)

        self.sess = sess
        self.init = tf.global_variables_initializer()

    def cnn_model(self, X_state, name):
        """
        Creates a CNN network with two convolutional layers followed by two fully connected layers.
        
        :param X_state: Placeholder for the state of the game
        :param name: Name of the network (actor or critic)
        :return : The output (logits) layer and the trainable variables
        """

        initializer = tf.contrib.layers.variance_scaling_initializer()

        conv1_fmaps = 32
        conv1_ksize = 8
        conv1_stride = 2
        conv1_pad = 'SAME'

        conv2_fmaps = 64
        conv2_ksize = 4
        conv2_stride = 2
        conv2_pad = 'SAME'

        n_fc1 = 256

        with tf.variable_scope(name) as scope:

            conv1 = tf.layers.conv2d(X_state,
                                     filters=conv1_fmaps,
                                     kernel_size=conv1_ksize,
                                     activation=self.activation,
                                     strides=conv1_stride,
                                     padding=conv1_pad,
                                     name='conv1')

            conv2 = tf.layers.conv2d(conv1,
                                     filters=conv2_fmaps,
                                     kernel_size=conv2_ksize,
                                     activation=self.activation,
                                     strides=conv2_stride,
                                     padding=conv2_pad,
                                     name='conv2')

            # With 20x20 inputs, the two stride-2 SAME convolutions shrink the feature
            # maps to 10x10 and then 5x5, hence the conv2_fmaps * 5 * 5 flattened size.
            conv2_flat = tf.reshape(conv2, shape=[-1, conv2_fmaps * 5 * 5])

            fc1 = tf.layers.dense(conv2_flat,
                                  n_fc1,
                                  activation=self.activation,
                                  name='fc1',
                                  kernel_initializer=initializer)

            logits = tf.layers.dense(fc1,
                                     N_OUTPUTS,
                                     kernel_initializer=initializer)

            trainable_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)

            trainable_vars_by_name = {
                var.name[len(scope.name):]: var
                for var in trainable_vars
            }
        return logits, trainable_vars_by_name

    def _build_graph(self):
        """
        Creates the Tensorflow graph of the CNN network.
        Two networks will be used, one for the actor, and one for the critic.
        """

        X_state = tf.placeholder(tf.float32, shape=[None, 20, 20, CHANNELS])
        actor_q_values, actor_vars = self.cnn_model(X_state, name="actor")
        critic_q_values, critic_vars = self.cnn_model(X_state, name="critic")

        with tf.variable_scope("train"):
            X_action = tf.placeholder(tf.int32, shape=[None])
            y = tf.placeholder(tf.float32, shape=[None, 1])
            '''A one-hot vector (tf.one_hot) is used to keep only the Q-value corresponding to the action stored in
            memory. Multiplying the one-hot vector with actor_q_values zeroes out all Q-values except the one for
            the memorized action; summing along axis 1 then yields the desired Q-value prediction for each memory.
            '''
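            # For example (purely illustrative), if N_OUTPUTS were 3 and the stored
            # action were 1, the one-hot row [0, 1, 0] keeps only the second Q-value,
            # and the sum over axis 1 reduces it to a single value per memory.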
            q_value = tf.reduce_sum(actor_q_values *
                                    tf.one_hot(X_action, N_OUTPUTS),
                                    axis=1,
                                    keep_dims=True)
            error = tf.abs(y - q_value)
            loss = tf.reduce_mean(clipped_error(error))
            global_step = tf.Variable(0, trainable=False,
                                      name='global_step')  # iteration step
            optimizer = self.optimizer(self.learning_rate,
                                       self.momentum,
                                       use_nesterov=True)
            training_op = optimizer.minimize(loss, global_step=global_step)

        self.saver = tf.train.Saver()
        self.X_state = X_state
        self.X_action = X_action
        self.y = y
        self.training_op = training_op
        self.loss = loss
        self.actor_q_values, self.actor_vars = actor_q_values, actor_vars
        self.critic_q_values, self.critic_vars = critic_q_values, critic_vars
        self.global_step = global_step

        with tf.variable_scope('summary'):
            self.loss_summary = tf.summary.scalar('loss', loss)
            self.mean_score = tf.placeholder(tf.float32, None)
            self.score_summary = tf.summary.scalar('mean score',
                                                   self.mean_score)
            self.summary_merged = tf.summary.merge(
                [self.loss_summary, self.score_summary])

    def start(self, checkpoint_path):
        """
        Intialize the model or restore the model if it already exists.
        
        :return: Iteration that we want the model to start training
        """
        if os.path.isfile(checkpoint_path + '.index'):
            self.saver.restore(self.sess, checkpoint_path)
            training_start = 1
            print('Restoring model...')
        else:
            # Make the model warm up before training
            training_start = 10000
            self.init.run()
            self.make_copy().run()
            print('New model...')
        return training_start

    def train(self, checkpoint_path, file_writer, mean_score):
        """
        Trains the agent and writes regularly a training summary.

        :param checkpoint_path: The path where the model will be saved
        :param file_writer: The file where the training summary will be written for Tensorboard visualization
        :param mean_score: The mean game score
        """
        copy_steps = 5000
        save_steps = 2000
        summary_steps = 500

        cur_states, actions, rewards, next_states, dones = self.sample_memories(
        )

        next_q_values = self.critic_q_values.eval(
            feed_dict={self.X_state: next_states})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        y_vals = rewards + (1 - dones) * self.discount_rate * max_next_q_values
        _, loss_val = self.sess.run([self.training_op, self.loss],
                                    feed_dict={
                                        self.X_state: cur_states,
                                        self.X_action: actions,
                                        self.y: y_vals
                                    })

        step = self.global_step.eval()

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            self.make_copy().run()

        # Save the model regularly
        if step % save_steps == 0:
            self.saver.save(self.sess, checkpoint_path)

        # Write the training summary regularly
        if step % summary_steps == 0:
            summary = self.sess.run(self.summary_merged,
                                    feed_dict={
                                        self.X_state: cur_states,
                                        self.X_action: actions,
                                        self.y: y_vals,
                                        self.mean_score: mean_score
                                    })

            file_writer.add_summary(summary, step)

    def predict(self, cur_state):
        """
        Makes the actor predict q-values based on the current state of the game.
        
        :param cur_state: Current state of the game
        :return The Q-values predicted by the actor
        """
        q_values = self.actor_q_values.eval(
            feed_dict={self.X_state: [cur_state]})
        return q_values

    def remember(self, cur_state, action, reward, new_state, done):
        self.memory.append([cur_state, action, reward, new_state, done])

    def act(self, cur_state, step):
        """
        :param cur_state: Current state of the game
        :param step: Training step
        :return: Action selected by the agent
        """
        eps_max = 1.0
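        # Epsilon decays linearly from eps_max to eps_min (reached after
        # eps_decay_steps / 2 steps because of the factor of 2) and is then
        # held at eps_min.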
        epsilon = max(
            self.eps_min, eps_max -
            (eps_max - self.eps_min) * 2 * step / self.eps_decay_steps)
        if np.random.rand() < epsilon:
            return np.random.randint(N_OUTPUTS), epsilon  # Random action
        else:
            q_values = self.predict(cur_state)
            return np.argmax(q_values), epsilon  # Optimal action

    def make_copy(self):
        """
        Makes regular copies of the training varibales from the critic to the actor.
        Credits goes to https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb.
        
        :return: A copy of the training variables
        """
        copy_ops = [
            target_var.assign(self.actor_vars[var_name])
            for var_name, target_var in self.critic_vars.items()
        ]
        copy_online_to_target = tf.group(*copy_ops)
        return copy_online_to_target

    def sample_memories(self, batch_size=32):
        """
        Extracts memories from the agent's memory.
        Credits goes to https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb.
        
        :param batch_size: Size of the batch that we extract form the memory
        :return: State, action, reward, next_state, and done values as np.arrays
        """
        cols = [[], [], [], [], []]  # state, action, reward, next_state, done
        for memory in self.memory.sample(batch_size):
            for col, value in zip(cols, memory):
                col.append(value)
        cols = [np.array(col) for col in cols]
        return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3],
                cols[4].reshape(-1, 1))
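

# Minimal usage sketch (illustrative addition, not part of the original code):
# it assumes a Gym-style environment exposing reset()/step(), and that the
# N_OUTPUTS, CHANNELS, ReplayMemory and clipped_error helpers referenced above
# are defined elsewhere in the module; paths and step counts are placeholders.
def run_actor_critic(env, n_steps=100000, checkpoint_path='./model.ckpt'):
    with tf.Session() as sess:
        agent = ActorCritic(sess)
        file_writer = tf.summary.FileWriter('./logs', tf.get_default_graph())
        training_start = agent.start(checkpoint_path)
        scores, score = [], 0.0
        state = env.reset()
        for step in range(n_steps):
            action, epsilon = agent.act(state, step)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            score += reward
            if done:
                scores.append(score)
                score, state = 0.0, env.reset()
            else:
                state = next_state
            if step >= training_start:
                agent.train(checkpoint_path, file_writer,
                            np.mean(scores[-100:]) if scores else 0.0)
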
Example #2
class TD3Agent:
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=64,
                 memory_capacity=100000,
                 tau=1e-2,
                 lr=0.00001,
                 pi_update_steps=2,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.lr = lr
        self.pi_update_steps = pi_update_steps
        self.render = render

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Create actor and critic network
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim).to(self.device)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim).to(self.device)

        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim).to(self.device)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim).to(self.device)

        # Same weights for target network as for original network
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)

        self.n_episodes = n_episodes
        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)

        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': [],
            'actor_losses': [],
            'critic_losses': [],
        })

    def train(self):
        for i in range(self.n_episodes):
            state = self.env.reset()

            for step in range(self.time_steps):
                if self.render:
                    self.env.render()

                state = tt(state)
                action = self.actor(state).cpu().detach().numpy()

                noise = np.random.normal(0,
                                         0.1,
                                         size=self.env.action_space.shape[0])
                action = np.clip(action + noise, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                next_state, reward, done, _ = self.env.step(action)

                # Save step in memory
                self.replay_memory.append(state=state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done)

                res = {
                    'episodes': i + 1,
                    'states': state.tolist(),
                    'rewards': reward,
                    'steps': step + 1
                }

                # Start training, if batch size reached
                if len(self.replay_memory) < self.batch_size:
                    self.res = self.res.append([res])
                    continue

                # Sample batch from memory
                states, actions, rewards, next_states, dones = self.replay_memory.sample_batch(
                )

                # Critic loss
                q1, q2 = self.critic(states, actions)
                next_actions = self.actor_target(next_states)

                noise = tt(torch.Tensor(actions.cpu()).data.normal_(0, 0.2))
                noise = noise.clamp(-0.5, 0.5)
                next_actions = (next_actions + noise).clamp(
                    self.env.action_space.low[0],
                    self.env.action_space.high[0])
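                # The clipped Gaussian noise above implements TD3's target policy
                # smoothing, which regularizes the value target.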
                # Get next state q values by Clipped Double Q-Learning
                q1_ns, q2_ns = self.critic_target(next_states,
                                                  next_actions.detach())
                q_ns = torch.min(q1_ns, q2_ns)
                td_target = rewards + self.gamma * q_ns

                loss_critic = self.critic_loss_fct(
                    q1, td_target) + self.critic_loss_fct(q2, td_target)
                res['critic_losses'] = float(loss_critic)

                # Optimize critic
                self.critic_optim.zero_grad()
                loss_critic.backward()
                self.critic_optim.step()

                # Delayed Policy Updates
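                # The actor and the target networks are updated only every
                # pi_update_steps critic updates (every 2nd step by default).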
                if step % self.pi_update_steps == 0:
                    q1, _ = self.critic(states, self.actor(states))
                    # Actor loss
                    loss_actor = -q1.mean()
                    res['actor_losses'] = float(loss_actor)

                    # Optimize actor
                    self.actor_optim.zero_grad()
                    loss_actor.backward()
                    self.actor_optim.step()

                    # update target networks
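                    # Soft (Polyak) update: target <- tau * online + (1 - tau) * target;
                    # with tau = 1e-2 the targets track the online networks slowly.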
                    for param, target_param in zip(
                            self.critic.parameters(),
                            self.critic_target.parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1 - self.tau) *
                                                target_param.data)

                    for param, target_param in zip(
                            self.actor.parameters(),
                            self.actor_target.parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1 - self.tau) *
                                                target_param.data)

                self.res = self.res.append([res])
                state = next_state
                if done:
                    break

            logging.info(f'Episode {i + 1}:')
            logging.info(
                f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}'
            )
            logging.info(
                f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}'
            )

        self.env.close()
        return self.res
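

# Minimal usage sketch (illustrative addition, not part of the original code):
# it assumes a continuous-control Gym environment such as Pendulum and that the
# Actor, Critic, ReplayMemory and tt helpers referenced above are available.
# The DDPGAgent further below can be driven in exactly the same way.
def run_td3(env_name='Pendulum-v0', n_episodes=100):
    import gym  # assumed to be installed alongside the other dependencies
    env = gym.make(env_name)
    agent = TD3Agent(env, n_episodes=n_episodes)
    results = agent.train()
    # Per-episode return, summed from the logged step rewards
    print(results.groupby('episodes')['rewards'].sum().tail())
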
Example #3
class DDPGAgent:
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=32,
                 memory_capacity=100000,
                 tau=1e-2,
                 eps=0.1,
                 lr=0.00001,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.eps = eps
        self.lr = lr
        self.render = render

        # Create actor and critic networks
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim)

        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim)

        # Same weights for target networks as for the online networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)

        self.n_episodes = n_episodes

        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)

        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': []
        })

    def train(self):
        for i in range(self.n_episodes):
            steps = 0
            state = self.env.reset()

            for step in range(self.time_steps):
                if self.render:
                    self.env.render()

                state = tt(state)
                action = self.actor(state).detach().numpy()

                # Exploration
                p = np.random.random()
                if p < self.eps:
                    action = np.random.uniform(low=-1, high=1, size=(1, ))
                # Do one step in env
                next_state, reward, done, _ = self.env.step(action)

                res = {
                    'episodes': i + 1,
                    'states': state.tolist(),
                    'rewards': reward,
                    'steps': step + 1
                }

                # Save step in memory
                self.replay_memory.append(state=state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done)

                # Start training, if batch size reached
                if len(self.replay_memory) < self.batch_size:
                    continue

                # Sample batch from memory
                states, actions, rewards, next_states, dones = self.replay_memory.sample_batch(
                )

                # Critic loss
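                # TD target: reward + gamma * Q'(next_state, mu'(next_state));
                # the critic is regressed toward this bootstrapped value with MSE.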
                q_values = self.critic(states, actions)
                next_actions = self.actor_target(next_states)
                q_values_ns = self.critic_target(next_states,
                                                 next_actions.detach())
                td_target = rewards + self.gamma * q_values_ns
                loss_critic = self.critic_loss_fct(q_values, td_target)

                # Actor loss
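                # Deterministic policy gradient: raise Q(s, mu(s)) by minimizing
                # its negative mean.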
                loss_actor = -(self.critic(states, self.actor(states)).mean())

                # Optimize actor
                self.actor_optim.zero_grad()
                loss_actor.backward()
                self.actor_optim.step()

                # Optimize critic
                self.critic_optim.zero_grad()
                loss_critic.backward()
                self.critic_optim.step()

                # update target networks
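                # Polyak averaging: target <- tau * online + (1 - tau) * target.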
                for target_param, param in zip(self.actor_target.parameters(),
                                               self.actor.parameters()):
                    target_param.data.copy_(param.data * self.tau +
                                            target_param.data *
                                            (1.0 - self.tau))

                for target_param, param in zip(self.critic_target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_(param.data * self.tau +
                                            target_param.data *
                                            (1.0 - self.tau))

                self.res = self.res.append([res])

                state = next_state
                steps += 1

                if done:
                    break

            logging.info(f'Episode {i + 1}:')
            logging.info(
                f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}'
            )
            logging.info(
                f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}'
            )

        self.env.close()
        return self.res