class DQN:
    """
    General DQN agent.
    Can be applied to any standard environment

    The implementation follows:
    Mnih et. al - Playing Atari with Deep Reinforcement Learning https://arxiv.org/pdf/1312.5602.pdf

    The q-network structure is different from the original paper

    see also:
    David Silver's RL course lecture 6: https://www.youtube.com/watch?v=UoPei5o4fps&t=1s
    """
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceHistory(num_frame_stack,
                                             capacity=experience_capacity,
                                             pic_size=pic_size)

        # in playing mode we don't store the experience in the agent's history,
        # but this cache is still needed to get the current frame stack
        self.playing_cache = ExperienceHistory(num_frame_stack,
                                               capacity=num_frame_stack * 5 +
                                               10,
                                               pic_size=pic_size)

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        # These default magic values work well with Adam in practice
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004,
                                                         epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack, ) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    @staticmethod
    def process_image(img):
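        # crop rows 34:194, downscale by 2, convert to grayscale
        # and map pixel values from [0, 1] to [-1, 1]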
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    def build_graph(self):
        input_dim_with_batch = (self.batchsize,
                                self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size

        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general,
                                               "prev_state")
        self.input_next_state = tf.placeholder(tf.float32,
                                               input_dim_with_batch,
                                               "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize,
                                           "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize,
                                            "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize,
                                              "done_mask")

        # These are the state action values for all states
        # The target Q-values come from the fixed network

        with tf.variable_scope("fixed"):
            qsa_targets = self.create_network(self.input_next_state,
                                              trainable=False)

        # with tf.variable_scope("Dueling_DDQN"):
        #     qsa_targets_DDQN = tf.stop_gradient(self.create_network(self.input_next_state,trainable=True))
        #     target_action = tf.argmax(qsa_targets_DDQN, axis=1)
        #     target_action_onehot = tf.one_hot(indices=target_action,depth=self.dim_actions)
        #     qsa_targets = tf.stop_gradient(tf.reduce_sum(tf.multiply(qsa_targets,target_action_onehot),reduction_indices=[1,]))

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state,
                                                trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        not_done = tf.cast(
            tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(
            qsa_targets, -1) * self.gamma * not_done + self.input_reward
        # q_target = qsa_targets * self.gamma * not_done + self.input_reward

        # select the chosen action from each row
        # in numpy this is qsa_estimates[range(batchsize), self.input_actions]
        action_slice = tf.stack(
            [tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates,
                                                    action_slice)

        training_loss = tf.nn.l2_loss(
            q_target - q_estimates_for_input_action) / self.batchsize

        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))

        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")

        assert (len(train_params) == len(fixed_params))
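        # ops that copy the online ("train") weights into the target ("fixed") network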
        self.copy_network_ops = [
            tf.assign(fixed_v, train_v)
            for train_v, fixed_v in zip(train_params, fixed_params)
        ]

    def get_variables(self, scope):
        vars = [
            t for t in tf.global_variables()
            if "%s/" % scope in t.name and "Adam" not in t.name
        ]
        return sorted(vars, key=lambda v: v.name)

    # the dueling network
    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None

        # the input is stack of black and white frames.
        # put the stack in the place of channel (last in tf)
        input_t = tf.transpose(input, [0, 2, 3, 1])

        net = slim.conv2d(input_t,
                          8, (7, 7),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          stride=3,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net,
                          16, (3, 3),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        fc_1 = slim.fully_connected(net,
                                    256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=wr,
                                    trainable=trainable)
        fc_2 = slim.fully_connected(net,
                                    256,
                                    activation_fn=tf.nn.relu,
                                    weights_regularizer=wr,
                                    trainable=trainable)
        value = slim.fully_connected(fc_1,
                                     1,
                                     activation_fn=None,
                                     weights_regularizer=wr,
                                     trainable=trainable)
        advantage = slim.fully_connected(fc_2,
                                         self.dim_actions,
                                         activation_fn=None,
                                         weights_regularizer=wr,
                                         trainable=trainable)
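        # dueling head: Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a'))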
        q_state_action_values = value + (advantage - tf.reduce_mean(
            advantage, axis=1, keepdims=True))

        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
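            # with the defaults this anneals epsilon from 1.0 to 0.1
            # over the first 1e6 global steps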
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon -
                                       self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)

        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self):
        eh = (self.exp_history if self.do_training else self.playing_cache)
        total_reward = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = self.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(self.best_action, {
                    self.input_prev_state:
                    eh.current_state()[np.newaxis, ...]
                })[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(
                reward, total_reward)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            frames_in_episode += 1

            eh.add_experience(self.process_image(observation), action_idx,
                              done, reward)

            if self.do_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size
                    and self.global_counter % self.train_freq == 0)
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                return total_reward, frames_in_episode

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
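
A minimal driver sketch for the agent above, assuming TensorFlow 1.x, gym, and an Atari-style environment whose 210x160 frames match the crop in process_image (rows 34:194, rescaled by 0.5 to roughly 80x80), so pic_size is set accordingly; the environment name and episode count are illustrative assumptions, not taken from the original code:

import gym
import tensorflow as tf

env = gym.make("Breakout-v0")                      # assumed environment
agent = DQN(env, pic_size=(80, 80), render=False)
agent.build_graph()

with tf.Session() as sess:
    agent.session = sess
    sess.run(tf.global_variables_initializer())
    agent.update_target_network()                  # sync the fixed (target) network once
    for _ in range(10):                            # a few episodes; real training needs far more
        total_reward, n_frames = agent.play_episode()
        print("episode reward: %.1f over %d frames" % (total_reward, n_frames))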
Example #3
class DQN:
    def __init__(self,
            env,
            batchsize=64,
            pic_size=(96, 96),
            num_frame_stack=4,
            gamma=0.95,
            frame_skip=1,
            train_freq=4,
            initial_epsilon=1.0,
            min_epsilon=0.1,
            render=True,
            epsilon_decay_steps=int(1e6),
            min_experience_size=int(1e3),
            experience_capacity=int(1e5),
            network_update_freq=5000,
            regularization = 1e-6,
            optimizer_params = None,
            action_map=None
    ):
        self.exp_history = ExperienceHistory(
            num_frame_stack,
            capacity=experience_capacity,
            pic_size=pic_size
        )

        # in playing mode we don't store the experience in the agent's history,
        # but this cache is still needed to get the current frame stack
        self.playing_cache = ExperienceHistory(
            num_frame_stack,
            capacity=num_frame_stack * 5 + 10,
            pic_size=pic_size
        )

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack,) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    @staticmethod
    def preprocessing(img):
        # convert the incoming frame to grayscale
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    def build_graph(self):
        input_dim_with_batch = (self.batchsize, self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size

        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general, "prev_state")
        self.input_next_state = tf.placeholder(tf.float32, input_dim_with_batch, "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize, "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize, "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize, "done_mask")

        # These are the state action values for all states
        # The target Q-values come from the fixed network
        with tf.variable_scope("fixed"):
            qsa_targets = self.create_network(self.input_next_state, trainable=False)

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state, trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)

        not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
        # select the chosen action from each row
        action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)

        training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize

        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))

        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        self.train_op = optimizer.minimize(reg_loss + training_loss)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")

        assert (len(train_params) == len(fixed_params))
        self.copy_network_ops = [tf.assign(fixed_v, train_v)
            for train_v, fixed_v in zip(train_params, fixed_params)]

    def get_variables(self, scope):
        vars = [t for t in tf.global_variables()
            if "%s/" % scope in t.name and "Adam" not in t.name]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None
        # the input is a stack of black-and-white frames
        # put the stack in the channel dimension (last in tf)
        input_t = tf.transpose(input, [0, 2, 3, 1])

        net = slim.conv2d(input_t, 8, (7, 7), data_format="NHWC",
            activation_fn=tf.nn.relu, stride=3, weights_regularizer=wr, trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net, 16, (3, 3), data_format="NHWC",
            activation_fn=tf.nn.relu, weights_regularizer=wr, trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        net = slim.fully_connected(net, 256, activation_fn=tf.nn.relu,
            weights_regularizer=wr, trainable=trainable)
        q_state_action_values = slim.fully_connected(net, self.dim_actions,
            activation_fn=None, weights_regularizer=wr, trainable=trainable)

        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)

        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op], fd1)

    def play_episode(self):
        eh = (
            self.exp_history if self.do_training
            else self.playing_cache
        )
        total_reward = 0
        frames_in_episode = 0

        first_frame = self.env.reset()
        first_frame_pp = self.preprocessing(first_frame)

        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(
                    self.best_action,
                    {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                )[0]
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(reward, total_reward)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            frames_in_episode += 1

            eh.add_experience(self.preprocessing(observation), action_idx, done, reward)

            if self.do_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size and
                    self.global_counter % self.train_freq == 0
                )
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                return total_reward, frames_in_episode

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
Example #4
class DQN:
    """
    General DQN agent.
    Can be applied to any standard environment
    The implementation follows:
    Mnih et. al - Playing Atari with Deep Reinforcement Learning https://arxiv.org/pdf/1312.5602.pdf
    The q-network structure is different from the original paper
    see also:
    David Silver's RL course lecture 6: https://www.youtube.com/watch?v=UoPei5o4fps&t=1s
    """
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceHistory(num_frame_stack,
                                             capacity=experience_capacity,
                                             pic_size=pic_size)

        # in playing mode we don't store the experience in the agent's history,
        # but this cache is still needed to get the current frame stack
        self.playing_cache = ExperienceHistory(num_frame_stack,
                                               capacity=num_frame_stack * 5 +
                                               10,
                                               pic_size=pic_size)

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        # These default magic values work well with Adam in practice
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004,
                                                         epsilon=1e-7)

        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None

        self.state_size = (self.num_frame_stack, ) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

    @staticmethod
    def process_image(img):
        return 2 * color.rgb2gray(transform.rescale(img[34:194], 0.5)) - 1

    @staticmethod
    def kl_divergence(p, q):
        return tf.reduce_sum(p * tf.log(p / q))

    def sample_z(self, mu, logvar):
        eps = tf.random_normal(shape=tf.shape(mu))
        return mu + tf.exp(logvar / 2) * eps

    def RGB(self, x):

        #x = tf.placeholder(tf.float32, [64, 96, 96, 3], name='image')
        #x = tf.image.resize_images(x, [64, 64])
        # resize expects (height, width); resizing to 64x64 gives the
        # 14 * 14 * 32 = 6272 flat vector that the reshape below expects
        x = tf.compat.v1.image.resize(x, [64, 64])
        #x = tf.image.resize_images(x, [None, 96, 96, 3])
        x = tf.layers.conv2d(x,
                             filters=32,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        x = tf.layers.conv2d(x,
                             filters=32,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        # x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu)
        # x = tf.layers.conv2d(x, filters=128, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu)
        # x = tf.layers.conv2d(x, filters=256, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu)
        x = tf.layers.flatten(x)
        x = tf.reshape(x, [-1, 6272])
        print(x)
        z_mu = slim.fully_connected(x, 5, activation_fn=tf.nn.elu)
        z_var = slim.fully_connected(x, 5, activation_fn=tf.nn.elu)
        print("slim", z_mu)
        return z_mu, z_var

    def Event(self, x):

        #x = tf.placeholder(tf.float32, [64, 96, 96, 3], name='image')
        #x = tf.image.resize_images(x, [64, 64])
        # resize expects (height, width); resizing to 64x64 gives the
        # 14 * 14 * 32 = 6272 flat vector that the reshape below expects
        x = tf.compat.v1.image.resize(x, [64, 64])

        #x = tf.image.resize_images(x,[None, 96, 96, 3])
        x = tf.layers.conv2d(x,
                             filters=32,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        x = tf.layers.conv2d(x,
                             filters=32,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        # x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu)
        # x = tf.layers.conv2d(x, filters=128, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu)
        # x = tf.layers.conv2d(x, filters=256, kernel_size=4, strides=2, padding='valid', activation=tf.nn.elu)
        x = tf.layers.flatten(x)
        x = tf.reshape(x, [-1, 6272])

        print(x)
        z_mu = slim.fully_connected(x, 5, activation_fn=tf.nn.elu)
        z_var = slim.fully_connected(x, 5, activation_fn=tf.nn.elu)
        print("slim", z_mu)

        return z_mu, z_var

    def encoderRGB(self, x):

        #x = tf.placeholder(tf.float32, [None, 96, 96, 3], name='image')
        x = tf.image.resize_images(x, [96, 96])
        x = tf.layers.conv2d(x,
                             filters=32,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        x = tf.layers.conv2d(x,
                             filters=64,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        x = tf.layers.conv2d(x,
                             filters=128,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        x = tf.layers.conv2d(x,
                             filters=256,
                             kernel_size=4,
                             strides=2,
                             padding='valid',
                             activation=tf.nn.elu)
        x = tf.layers.flatten(x)
        fc1 = tf.reshape(x, [-1, 4096])
        shapes = tf.shape(x)
        #z_mu = slim.fully_connected(fc1, 5, activation_fn=tf.nn.elu)
        z_mean = tf_contrib.layers.fully_connected(fc1, 32)

        shape = x.get_shape().as_list()
        z_mua = tf.layers.dense(fc1, units=32, name='z_mu')
        z_logvara = tf.layers.dense(fc1, units=32, name='z_logvar')
        # dim = np.prod(shape[1:])
        # x2 = tf.reshape(-1, x.get_shape())
        #print("dimension!!!",fc1)

        # x = tf.reshape(-1,4096)

        # z_mus = tf.layers.dense(x2, units=32, name='z_mu')
        # z_logvars = tf.layers.dense(x2, units=32, name='z_logvar')

        return z_mean, z_logvara

    def encoderEvent(self, input):

        wr = slim.l2_regularizer(1e-6)

        # move the frame stack into the channel dimension (NHWC)
        input_t = tf.transpose(input, [0, 2, 3, 1])
        net = slim.conv2d(input_t,
                          8, (7, 7),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          stride=3,
                          weights_regularizer=wr)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(
            net,
            16,
            (3, 3),
            data_format="NHWC",
            activation_fn=tf.nn.relu,
            weights_regularizer=wr,
        )
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        net = slim.fully_connected(net,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   weights_regularizer=wr)

        return net

    def compute_loss(self):
        logits_flat = tf.layers.flatten(self.reconstructions)
        labels_flat = tf.layers.flatten(self.resized_image)
        reconstruction_loss = tf.reduce_sum(tf.square(logits_flat -
                                                      labels_flat),
                                            axis=1)
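        # closed-form KL(N(mu, exp(logvar)) || N(0, I)), summed over the latent dims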
        kl_loss = 0.5 * tf.reduce_sum(
            tf.exp(self.z_logvar) + self.z_mu**2 - 1. - self.z_logvar, 1)
        vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)
        return vae_loss

    def build_graph(self, env):

        input_dim_with_batch = (self.batchsize,
                                self.num_frame_stack) + self.pic_size
        input_dim_general = (None, self.num_frame_stack) + self.pic_size
        self.input_prev_state = tf.placeholder(tf.float32, input_dim_general,
                                               "prev_state")
        self.input_next_state = tf.placeholder(tf.float32,
                                               input_dim_with_batch,
                                               "next_state")
        self.input_reward = tf.placeholder(tf.float32, self.batchsize,
                                           "reward")
        self.input_actions = tf.placeholder(tf.int32, self.batchsize,
                                            "actions")
        self.input_done_mask = tf.placeholder(tf.int32, self.batchsize,
                                              "done_mask")
        #self.first_frame = tf.placeholder(tf.int32, self.batchsize, "first_frame")
        self.loss_vector = tf.placeholder(tf.float32, shape=self.batchsize)
        # These are the state action values for all states
        # The target Q-values come from the fixed network
        #ENCODER AND LOSS CALCULATION
        ####################################################################
        first_frame = env.reset()
        self.val = tf.placeholder(tf.float32, [64, 96, 96, 3], name='rgb')
        self.event = tf.placeholder(tf.float32, [64, 96, 96, 3], name='event')
        self.val, self.event = env.returnRgb()
        rgb_mu, rgb_var = self.RGB(self.val)
        events_mu, events_var = self.Event(self.event)
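        # closed-form KL between two diagonal Gaussians (assumed to be the
        # intended loss here):
        #   KL(N(mu1, s1^2) || N(mu2, s2^2))
        #     = log(s2 / s1) + (s1^2 + (mu1 - mu2)^2) / (2 * s2^2) - 1/2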
        rgb_std = tf.math.sqrt(rgb_var)
        event_std = tf.math.sqrt(events_var)
        self.val_for_loss = ((rgb_std**2) + (
            (rgb_mu - events_mu)**2)) / (2 * (event_std**2))
        self.loss_vector = tf.math.log(
            event_std / rgb_std) + self.val_for_loss - 0.5

        # self.val_latent = self.sample_z(vals_mu,vals_var)
        # self.val_event = self.sample_z(events_mu,events_var)

        # x = tf.Print(self.loss_vector,[self.loss_vector])
        # sess = tf.InteractiveSession()
        # sess.run(x)
        # sess.close()

        # X = tf.distributions.Normal(probs=self.val_latent)
        # Y = tf.distributions.Normal(probs=self.val_event)
        # self.loss_vector = tf.distributions.kl_divergence(X, Y)
        #####################################################################
        with tf.variable_scope("fixed"):
            qsa_targets = self.create_network(self.input_next_state,
                                              trainable=False)

        with tf.variable_scope("train"):
            qsa_estimates = self.create_network(self.input_prev_state,
                                                trainable=True)

        self.best_action = tf.argmax(qsa_estimates, axis=1)
        not_done = tf.cast(
            tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
        q_target = tf.reduce_max(
            qsa_targets, -1) * self.gamma * not_done + self.input_reward
        # select the chosen action from each row
        # in numpy this is qsa_estimates[range(batchsize), self.input_actions]
        action_slice = tf.stack(
            [tf.range(0, self.batchsize), self.input_actions], axis=1)
        q_estimates_for_input_action = tf.gather_nd(qsa_estimates,
                                                    action_slice)

        training_loss = tf.nn.l2_loss(
            q_target - q_estimates_for_input_action) / self.batchsize
        optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        # x = tf.Print(reg_loss, [reg_loss])
        # sess = tf.InteractiveSession()
        # sess.run(x)
        # sess.close()

        self.train_op = optimizer.minimize(reg_loss +
                                           training_loss)  #+ self.loss_vector)
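        # note: self.loss_vector is computed above but, as the commented-out
        # term shows, it is not currently part of the minimized objective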
        tf.print(self.train_op)

        train_params = self.get_variables("train")
        fixed_params = self.get_variables("fixed")

        assert (len(train_params) == len(fixed_params))
        self.copy_network_ops = [
            tf.assign(fixed_v, train_v)
            for train_v, fixed_v in zip(train_params, fixed_params)
        ]

        return self.train_op

    def get_variables(self, scope):
        vars = [
            t for t in tf.global_variables()
            if "%s/" % scope in t.name and "Adam" not in t.name
        ]
        return sorted(vars, key=lambda v: v.name)

    def create_network(self, input, trainable):
        if trainable:
            wr = slim.l2_regularizer(self.regularization)
        else:
            wr = None

        # the input is stack of black and white frames.
        # put the stack in the place of channel (last in tf)
        input_t = tf.transpose(input, [0, 2, 3, 1])
        net = slim.conv2d(input_t,
                          8, (7, 7),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          stride=3,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.conv2d(net,
                          16, (3, 3),
                          data_format="NHWC",
                          activation_fn=tf.nn.relu,
                          weights_regularizer=wr,
                          trainable=trainable)
        net = slim.max_pool2d(net, 2, 2)
        net = slim.flatten(net)
        net = slim.fully_connected(net,
                                   256,
                                   activation_fn=tf.nn.relu,
                                   weights_regularizer=wr,
                                   trainable=trainable)
        q_state_action_values = slim.fully_connected(net,
                                                     self.dim_actions,
                                                     activation_fn=None,
                                                     weights_regularizer=wr,
                                                     trainable=trainable)

        return q_state_action_values

    def check_early_stop(self, reward, totalreward):
        return False, 0.0

    def get_random_action(self):
        return np.random.choice(self.dim_actions)

    def get_epsilon(self):
        if not self.do_training:
            return self.playing_epsilon
        elif self.global_counter >= self.epsilon_decay_steps:
            return self.min_epsilon
        else:
            # linear decay
            r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
            return self.min_epsilon + (self.initial_epsilon -
                                       self.min_epsilon) * r

    def train(self):
        batch = self.exp_history.sample_mini_batch(self.batchsize)

        fd = {
            self.input_reward: "reward",
            self.input_prev_state: "prev_state",
            self.input_next_state: "next_state",
            self.input_actions: "actions",
            self.input_done_mask: "done_mask"
        }
        fd1 = {ph: batch[k] for ph, k in fd.items()}
        self.session.run([self.train_op, self.loss_vector], fd1)

    def play_episode(self):
        eh = (self.exp_history if self.do_training else self.playing_cache)
        total_reward = 0
        frames_in_episode = 0
        # with tf.compat.v1.Session() as sess:

        first_frame = self.env.reset()
        #From CarRacing

        # sess.close()
        #value = tfp.distributions.kl_divergence(latent, latent_event, allow_nan_stats=True, name=None)

        #kl_r = rel_entr(latent,latent_event)
        #clear_session()
        # a = tf.constant([[4,3],[3,3]])
        # print(type(a))
        # sess = tf.InteractiveSession()
        # xo = tf.Print(mu,[mu])
        # sess.run(xo)
        # sess.close()
        wr = slim.l2_regularizer(self.regularization)

        # k = tf.keras.losses.KLDivergence()
        # loss = k(latent,latent_event)

        #PRINTING VALUES
        # a = tf.constant([[4,3],[3,3]])
        # x = tf.Print(self.val_latent,[self.val_latent])
        # sess = tf.InteractiveSession()
        # sess.run(x)
        # sess.close()

        first_frame_pp = self.process_image(first_frame)

        eh.start_new_episode(first_frame_pp)

        while True:
            if np.random.rand() > self.get_epsilon():
                action_idx = self.session.run(self.best_action, {
                    self.input_prev_state:
                    eh.current_state()[np.newaxis, ...]
                })[0]
                # action_idx, loss_vec = self.session.run(
                #     [self.best_action, self.loss_vector],
                #     {self.input_prev_state: eh.current_state()[np.newaxis, ...]}
                # )
            # print("Loss vec:", self.loss_vector)
            else:
                action_idx = self.get_random_action()

            if self.action_map is not None:
                action = self.action_map[action_idx]
            else:
                action = action_idx

            reward = 0
            for _ in range(self.frame_skip):
                observation, r, done, info = self.env.step(action)
                if self.render:
                    self.env.render()
                reward += r
                if done:
                    break

            early_done, punishment = self.check_early_stop(
                reward, total_reward)
            if early_done:
                reward += punishment

            done = done or early_done

            total_reward += reward
            frames_in_episode += 1

            eh.add_experience(self.process_image(observation), action_idx,
                              done, reward)

            if self.do_training:
                self.global_counter += 1
                if self.global_counter % self.network_update_freq == 0:
                    self.update_target_network()
                train_cond = (
                    self.exp_history.counter >= self.min_experience_size
                    and self.global_counter % self.train_freq == 0)
                if train_cond:
                    self.train()

            if done:
                if self.do_training:
                    self.episode_counter += 1

                return total_reward, frames_in_episode

    def update_target_network(self):
        self.session.run(self.copy_network_ops)
Example #5
    def test_many_frames(self):
        n_frames = 1000
        size = 30
        frames = np.ones(
            (n_frames, 2, 2)).astype("float32") * np.arange(n_frames).reshape(
                -1, 1, 1)
        start_frame = np.ones((2, 2), "float32") * 10000
        h = ExperienceHistory(num_frame_stack=num_frame_stack,
                              capacity=30,
                              pic_size=pic_size)

        h.start_new_episode(start_frame)

        #add 10 frames
        for f in frames[:10]:
            h.add_experience(f, 12, False, 5.0)

        this_state = h.current_state()
        h.add_experience(frames[10], 10, False, 4)

        def a():
            assert np.all(this_state == frames[7:10])
            assert h.rewards[10] == 4
            assert h.actions[10] == 10
            assert not h.is_done[10]
            assert np.all(h.frames[h.prev_states[10]] == frames[7:10])
            assert np.all(h.frames[h.next_states[10]] == frames[8:11])

        # Check that adding one frame
        # doesn't mess up the existing history
        a()

        # add 29 more experiences and check that
        # the past experience is not changed
        for f in frames[11:40]:
            done = np.random.rand() > 0.5
            h.add_experience(f, 0, done, 1.0)
            if done:
                h.start_new_episode(start_frame)
            a()

        # adding one more experience should
        # overwrite the oldest experience:
        h.add_experience(frames[40], 1, False, 2.0)
        assert h.rewards[10] == 2.0
        assert h.actions[10] == 1
        with self.assertRaises(AssertionError):
            a()
Example #6
    def test_add_frame(self):
        h = ExperienceHistory(num_frame_stack=num_frame_stack,
                              capacity=size,
                              pic_size=pic_size)

        # can't do anything because no episode has started
        with self.assertRaises(AssertionError):
            h.current_state()
        with self.assertRaises(AssertionError):
            h.add_experience(None, None, None, None)

        frames = np.random.rand(4, 2, 2).astype("float32")

        # add the beginning frame
        h.start_new_episode(frames[0])

        # Check that padding works correctly
        assert (h.current_state() == frames[0]).all()
        assert (h.current_state().shape == (num_frame_stack, ) + pic_size)

        # Now add the next frame.
        # The action is the action taken before this frame,
        # the reward is the reward observed for that action,
        # and done flags whether we ended up in a terminal state.
        h.add_experience(frames[1], 4, False, 1.0)

        assert (h.current_state()[:2] == frames[0]).all()
        assert (h.current_state()[2] == frames[1]).all()
        assert (h.current_state().shape == (num_frame_stack, ) + pic_size)

        # add one more experience and set episode as finished
        h.add_experience(frames[2], 5, True, 2.0)

        # now there should not be any padding for current state
        assert (h.current_state() == frames[:3]).all()
        assert (h.current_state().shape == (num_frame_stack, ) + pic_size)

        assert np.all(h.next_states[:3] == np.array([[0, 0, 1], [0, 1, 2],
                                                     [-1, -1, -1]]))
        assert np.all(h.prev_states[:3] == np.array([[0, 0, 0], [0, 0, 1],
                                                     [-1, -1, -1]]))

        h.start_new_episode(frames[3])

        assert (h.current_state() == frames[3]).all()
        assert (h.current_state().shape == (num_frame_stack, ) + pic_size)

        batch = h.sample_mini_batch(20)

        # Check that we don't sample from the part which is not yet written
        # i.e shouldn't see zeros (the caches are initialized with zeros)
        assert np.all(np.in1d(batch["reward"], [1., 2.]))
        assert np.all(np.in1d(batch["actions"], [4., 5.]))

        # the transition into the 2nd frame was the only one where the episode was not done
        dm = ~batch["done_mask"].astype(bool)
        assert np.all(batch["next_state"][dm] == np.array(frames[[0, 0, 1]]))

        # frames[2] in the history is overwritten by frames[3] because a new episode has started;
        # this doesn't matter because the terminal state shouldn't be used anywhere.
        assert np.all(batch["next_state"][~dm] == np.array(frames[[0, 1, 3]]))
        assert np.all((batch["prev_state"] == frames[0])
                      | (batch["prev_state"] == frames[1]))