Example 1
# Dependencies (TF 1.x): the class below uses tf.contrib.slim layers and NumPy.
# It also relies on an ExperienceReplay buffer class that is defined elsewhere
# in the project and is not shown here.
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
class DQN:
    """
    DQN implementation. Note: only gym-like environments (i.e. exposing reset, step, ...) are supported.
    """
    def __init__(self,
            env,
            obs_size = (115,),
            num_frame_stack = 1,
            batch_size = 32,
            mdp_gamma = 0.95,
            initial_epsilon = 1.0,
            min_epsilon = 0.1,
            epsilon_decay_steps = int(1e6),
            replay_capacity = int(1e5),
            min_replay_size = int(1e3),
            train_freq = 4,
            network_update_freq = 5000,
            regularization = 1e-6,
            optimizer_params = None,
            render = False):

            """
            Initialization function
            
            param env:                object. a gym-like environment which our RL agent interacts with
            parma obs_size:           list. the shape of the observation, i.e. (115,) for vector observation or (32,32) for image observation
            parma num_frame_stack:    int. number of stacked frames for network input
            param batch_size:         int. batch size
            param mdp_gamma:          float. MDP discount factor
            param initial_epsilon:    float. epsilon parameter of epsilon-greedy policy
            param min_epsilon:        float. minimum epsilon parameter of epsilon-greedy policy
            param epsilon_decay_steps: int. how many steps to decay epsilon 
            param replay_capacity:    int. replay buffer size
            param min_replay_size:    int. minimum replay buffer size
            param train_freq:         int. training frequency
            param network_update_freq: int. network update frequency
            param regularization:     float. regularization coefficient
            param optimizer_params:   dict. optimizer specilized parameters. i.e. learning rate, momentum
            param render:             bool. is render mode on?
            """
            
            # experience replay buffer for training
            self.exp_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=replay_capacity,
                obs_size = obs_size
            )

            # experience replay buffer for playing/testing
            self.play_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=num_frame_stack * 10,
                obs_size = obs_size
            )

            self.env = env
            self.obs_size = obs_size
            self.num_frame_stack = num_frame_stack
            self.batch_size = batch_size
            self.mdp_gamma = mdp_gamma
            self.initial_epsilon = initial_epsilon
            self.min_epsilon = min_epsilon
            self.epsilon_decay_steps = epsilon_decay_steps
            self.replay_capacity = replay_capacity
            self.min_replay_size = min_replay_size
            self.train_freq = train_freq
            self.network_update_freq = network_update_freq
            self.regularization = regularization
            self.render = render

            self.dim_actions = env.action_space.n
            self.dim_state = (num_frame_stack,) + self.obs_size

            if optimizer_params:
                self.optimizer_params = optimizer_params
            else:
                self.optimizer_params = dict(learning_rate = 0.0001, epsilon = 1e-7)

            self.is_training = True
            # epsilon used for playing
            # if 0, means that we just use the Q-network's optimal action without any exploration
            self.playing_epsilon = 0.0
            
            self.session = None
            
            self.global_counter = 0
            self.episode_counter = 0
            self.loss_history = []

    def get_variables(self, scope):
        """
        Get the variables under the given scope name, excluding Adam optimizer slots
        """
        vars_list = []
        for var in tf.global_variables():
            if "%s/" % scope in var.name and "Adam" not in var.name:
                vars_list.append(var)
        return sorted(vars_list, key=lambda x: x.name)
    
    def get_epsilon(self):
        """
        Get the current epsilon value.
        Note: epsilon decays as training progresses
        """
        if not self.is_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # for simplicity, just use linear decay
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * (1.0 - self.global_counter / float(self.epsilon_decay_steps))
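            # e.g. with initial_epsilon = 1.0, min_epsilon = 0.1 and
            # epsilon_decay_steps = 1e6, at global_counter = 250000 the remaining
            # fraction is 0.75, so epsilon = 0.1 + 0.9 * 0.75 = 0.775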
            

    def network(self, input, trainable, use_image = False):
        """
        Implementation of Q(s,a) network
        
        param input:  tensor. [Batch_Size, N_State] or [Batch_Size, Num_stack_frame, H, W]

        """

        regularizer = None
        if trainable:
            regularizer = slim.l2_regularizer(self.regularization)
        
        if not use_image:
            # vanilla multilayer perceptron (three hidden layers plus a linear output) for vector observations
            # 1st layer
            net = slim.fully_connected(input, 512, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            # 2nd layer
            net = slim.fully_connected(net, 1024, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            # 3rd layer
            net = slim.fully_connected(net, 512, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            # 4th layer
            #net = slim.fully_connected(net, 256, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)

            # output layer
            q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn = None, weights_regularizer = regularizer, trainable = trainable)

        else:
            # convolutional branch for image observations:
            # convert the input from [batch, stack, H, W] to NHWC (stacked frames become channels)
            x = tf.transpose(input, [0,2,3,1])

            net = slim.conv2d(x, 8, (7,7),  stride = 3, data_format = "NHWC", activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            net = slim.max_pool2d(net, 2, 2)
            net = slim.conv2d(net, 16, (3,3), stride = 1, data_format = "NHWC", activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            net = slim.max_pool2d(net, 2, 2)
            net = slim.flatten(net)
            net = slim.fully_connected(net, 256, activation_fn = tf.nn.relu, weights_regularizer = regularizer, trainable = trainable)
            q_state_action_values = slim.fully_connected(net, self.dim_actions, activation_fn = None, weights_regularizer = regularizer, trainable = trainable)
        
        return q_state_action_values

    def sample_random_action(self):
        """
        Randomly sample an action for rollout
        """
        return np.random.choice(self.dim_actions)
    
    
    
    def setup_graph(self, use_image = False, if_soft = True):
        """
        Set up the TensorFlow computation graph

        param use_image: bool. build the convolutional Q-network instead of the MLP
        param if_soft:   bool. use soft (Polyak) target updates instead of a hard copy
        """

        # define a bunch of placeholders
        if use_image:
            input_next_state_shape = (self.batch_size, self.num_frame_stack) + self.obs_size
            input_prev_state_shape = (None, self.num_frame_stack) + self.obs_size
        else:
            input_next_state_shape = (self.batch_size, self.obs_size[0])
            input_prev_state_shape = (None, self.obs_size[0])

        self.input_prev_state = tf.placeholder(tf.float32, input_prev_state_shape, name = "input_prev_state")
        self.input_next_state = tf.placeholder(tf.float32, input_next_state_shape, name = "input_next_state")
        self.input_actions = tf.placeholder(tf.int32, self.batch_size, name = "input_actions")
        self.input_reward = tf.placeholder(tf.float32, self.batch_size, name = "input_reward")
        self.is_done = tf.placeholder(tf.int32, self.batch_size, name = "is_done")

        self.optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        """
        Q-learning:
        1. take action a_t according to epsilon-greedy policy
        2. store transition (s_t, a_t, r_t+1, s_t+1) in replay buffer D
        3. sample random mini-batch of transitions (s,a,r,s') from D
        3. compute Q-learning targets w.r.t. old, fixed parameters w-
        4. optimise MSE between Q-network and Q-learning targets

        L(w) = E{s,a,r,s' ~ D} [(r + \gamma \max_a'  Q(s',a',w-) - Q(s,a,w))^2]

        5. use variant of stochastic gradient descent
        """
        # Note: the following 2 networks need to have the same structure
        # fixed, old parameters Q-network for Q-target estimation
        with tf.variable_scope("target_q"):
            q_target = self.network(self.input_next_state, trainable=False, use_image = use_image)
        
        # trainable, new parameters Q-network for Q-learning
        with tf.variable_scope("update_q"):
            q_estimate = self.network(self.input_prev_state, trainable=True, use_image = use_image)
        # optimal action recovered by newest Q-network
        self.optimal_action = tf.argmax(q_estimate, axis = 1)
        
        not_done = tf.cast(tf.logical_not(tf.cast(self.is_done, "bool")), tf.float32)
        q_target_value = self.input_reward + not_done * self.mdp_gamma * tf.reduce_max(q_target, -1)

        # select the Q-value of the chosen action (self.input_actions) from q_estimate
        # first build (batch_index, action) index pairs
        idx = tf.stack([tf.range(0, self.batch_size), self.input_actions], axis = 1)
        q_estimate_value = tf.gather_nd(q_estimate, idx)
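        # e.g. with batch_size = 3 and input_actions = [2, 0, 1], idx is
        # [[0, 2], [1, 0], [2, 1]] and gather_nd returns
        # [Q(s_0, 2), Q(s_1, 0), Q(s_2, 1)]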

        # mean squared TD error (note: tf.nn.l2_loss includes a factor of 1/2)
        mse_loss = tf.nn.l2_loss(q_estimate_value - q_target_value) / self.batch_size
        # Regularization loss
        regularization_loss = tf.add_n(tf.losses.get_regularization_losses())

        self.loss = mse_loss + regularization_loss
        self.train_op = self.optimizer.minimize(self.loss)

        update_params = self.get_variables("update_q")
        target_params = self.get_variables("target_q")

        assert (len(update_params) == len(target_params))
        # weight copy ops: either a soft (Polyak) update with tau = 0.001,
        # i.e. target <- 0.001 * online + 0.999 * target, or a hard copy
        if if_soft:
            self.assign_op = [tf.assign(tp, 0.001 * up + 0.999 * tp) for tp, up in zip(target_params, update_params)]
        else:
            self.assign_op = [tf.assign(tp, up) for tp, up in zip(target_params, update_params)]

    def train(self):
        """
        train step
        """
        # sample one mini-batch to compute mse
        batch = self.exp_buffer.sample_mini_batch(self.batch_size)
        if self.num_frame_stack > 1:
            # stacked (image) observations: keep the frame-stack axis
            feed_dict = {
                self.input_prev_state : batch["prev_state"],
                self.input_next_state : batch["next_state"],
                self.input_actions: batch["actions"],
                self.is_done: batch["done_mask"],
                self.input_reward: batch["reward"]
            }
        else:
            # vector observations: drop the singleton frame-stack axis
            feed_dict = {
                self.input_prev_state : batch["prev_state"][:,0,:],
                self.input_next_state : batch["next_state"][:,0,:],
                self.input_actions: batch["actions"],
                self.is_done: batch["done_mask"],
                self.input_reward: batch["reward"]
            }

        _, loss = self.session.run([self.train_op, self.loss], feed_dict=feed_dict)
        self.loss_history.append(loss)

        return loss

    def update_target_network(self):
        """
        Update target network
        """
        # no need for feed dicts
        self.session.run(self.assign_op)

    def play_episode(self):
        if self.is_training:
            rb = self.exp_buffer
        else:
            rb = self.play_buffer
        
        # total reward
        sum_reward = 0
        # total loss
        sum_loss = 0
        # steps
        steps_in_episode = 0

        first_obs = self.env.reset()
        rb.new_episode(first_obs)

        while True:
            if np.random.rand() > self.get_epsilon():
                if self.num_frame_stack > 1:
                    action = self.session.run(self.optimal_action, {self.input_prev_state: rb.current_state()[np.newaxis,:]})[0]
                else:
                    action = self.session.run(self.optimal_action, {self.input_prev_state: rb.current_state()})[0]
            else:
                action = self.sample_random_action()
             
            obs, reward, done, info = self.env.step(action)
            if self.render:
                self.env.render()
            
            sum_reward += reward
            steps_in_episode += 1

            # add one experience into buffer
            rb.add_experience(obs, action, done, reward)

            if self.is_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                if self.exp_buffer.counter >= self.min_replay_size and self.global_counter % self.train_freq == 0:
                    sum_loss += self.train()
            if done:
                if self.is_training:
                    self.episode_counter += 1
                
                return sum_reward, steps_in_episode, sum_loss / float(steps_in_episode)
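
A minimal driver sketch for the class above, assuming a gym-style environment and the ExperienceReplay class referenced in the constructor; the environment id, obs_size and episode count are illustrative choices, not values from the original code.

import gym

# illustrative environment; obs_size must match env.observation_space
env = gym.make("CartPole-v1")
agent = DQN(env, obs_size=(4,), num_frame_stack=1)
agent.setup_graph(use_image=False, if_soft=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    agent.session = sess
    for episode in range(500):
        reward, steps, mean_loss = agent.play_episode()
        print("episode %d: reward=%.1f steps=%d mean_loss=%.4f" % (episode, reward, steps, mean_loss))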
Example 2
# Dependencies (TF 1.x + tf.contrib.slim as above); image preprocessing uses scikit-image,
# and ExperienceReplay here is a pic_size-aware replay buffer defined elsewhere.
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
from skimage import color, transform
class DQN:
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceReplay(num_frame_stack,
                                            capacity=experience_capacity,
                                            pic_size=pic_size)
        self.playing_cache = ExperienceReplay(num_frame_stack,
                                              capacity=num_frame_stack * 5 +
                                              10,
                                              pic_size=pic_size)
        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004,
                                                         epsilon=1e-7)
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.state_size = (self.num_frame_stack, ) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n
        self.q_values = []
        self.loss_his = []

    @staticmethod
    def process_image(img):
        # crop the frame (rows 34:194), downscale by 2, convert to grayscale
        # and rescale pixel values to the range [-1, 1]
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    def build_graph(self):
        input_dim_with_batch = (self.batchsize,
                                self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size

        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general,
                                               "prev_state")
        self.input_next_state = tf.placeholder(tf.float32,
                                               input_dim_with_batch,
                                               "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize,
                                           "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize,
                                            "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize,
                                              "done_mask")

        with tf.variable_scope("target"):
            qsa_targets = self.create_network(self.input_next_state,
                                              trainable=False)

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state,
                                                trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)
        not_done = tf.cast(
            tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(
            qsa_targets, -1) * self.gamma * not_done + self.input_reward
        self.q_value_mean = tf.reduce_mean(q_target)
        action_slice = tf.stack(
            [tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates,
                                                    action_slice)

        training_loss = tf.nn.l2_loss(
            q_target - q_estimates_for_input_action) / self.batchsize
        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.loss = tf.reduce_mean(reg_loss + training_loss)
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        target_params = self.get_variables("target")

        # hard-copy ops: target network <- training network
        self.copy_network_ops = [
            tf.assign(target_v, train_v)
            for train_v, target_v in zip(train_params, target_params)
        ]

    def get_variables(self, scope):
        vars = [
            t for t in tf.global_variables()
            if "%s/" % scope in t.name and "Adam" not in t.name
        ]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None

        input_t = tf.transpose(input, [0, 2, 3, 1])
        net = slim.conv2d(input_t,
                          8, (7, 7),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          stride=3,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net,
                          16, (3, 3),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        fc_1 = slim.fully_connected(net,
                                    256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=wr,
                                    trainable=trainable)
        fc_2 = slim.fully_connected(net,
                                    256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=wr,
                                    trainable=trainable)
        value = slim.fully_connected(fc_1,
                                     1,
                                     activation_fn=None,
                                     weights_regularizer=wr,
                                     trainable=trainable)
        advantage = slim.fully_connected(fc_2,
                                         self.dim_actions,
                                         activation_fn=None,
                                         weights_regularizer=wr,
                                         trainable=trainable)
        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        q_state_action_values = value + (advantage - tf.reduce_mean(
            advantage, axis=1, keepdims=True))
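        # e.g. if V(s) = 1.0 and A(s, .) = [2.0, 0.0, -2.0], the mean advantage is 0.0,
        # so Q(s, .) = [3.0, 1.0, -1.0]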

        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        # hook for environment-specific early termination; disabled by default
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon -
                                       self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)

        # map each placeholder to the corresponding key of the sampled batch,
        # then build the actual feed dict from the batch contents
        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        _, action_value, loss = self.session.run(
            [self.train_op, self.q_value_mean, self.loss], fd1)
        self.q_values.append(action_value)
        self.loss_his.append(loss)

    def play_episode(self):
        eh = (self.exp_history if self.do_training else self.playing_cache)
        total_reward = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = self.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(self.best_action, {
                    self.input_prev_state:
                    eh.current_state()[np.newaxis, ...]
                })[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(
                reward, total_reward)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            frames_in_episode += 1

            eh.add_experience(self.process_image(observation), action_idx,
                              done, reward)

            if self.do_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size
                    and self.global_counter % self.train_freq == 0)
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                q_value = np.mean(self.q_values)
                loss = np.mean(self.loss_his)
                self.q_values = []
                self.loss_his = []
                return total_reward, frames_in_episode, q_value, loss

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
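
A minimal driver sketch for this dueling variant, assuming an Atari-style environment whose frames match the crop in process_image; the environment id, pic_size and episode count are illustrative guesses rather than values taken from the original code.

import gym

env = gym.make("Pong-v0")  # illustrative Atari environment (requires gym's Atari extras)
# pic_size must equal the shape produced by process_image for this environment's frames
agent = DQN(env, pic_size=(80, 80), num_frame_stack=4, frame_skip=4, render=False)
agent.build_graph()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    agent.session = sess
    for episode in range(1000):
        total_reward, frames, mean_q, mean_loss = agent.play_episode()
        print("episode %d: reward=%.1f frames=%d mean_q=%.3f loss=%.4f" %
              (episode, total_reward, frames, mean_q, mean_loss))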