Code example #1

# NOTE: this example targets Python 2 (xrange) and a pre-1.0 TensorFlow API
# (tf.mul, tf.pack, tf.batch_matmul, tf.initialize_all_variables,
# tf.train.SummaryWriter). The fully_connected and batch_norm helpers match
# the tf.contrib.layers signatures of that era; ReplayBuffer and OUProcess
# are project-local helpers (experience replay and Ornstein-Uhlenbeck
# exploration noise) and are not shown here.
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm, fully_connected


class DDPG(object):
    def __init__(self,
                 sess,
                 env,
                 state_dim,
                 action_dim,
                 max_buffer_size=100000,
                 update_per_iteration=5,
                 mini_batch_size=64,
                 discount=0.99,
                 batch_norm=True,
                 actor_learning_rate=0.0001,
                 critic_learning_rate=0.001,
                 tau=0.001,
                 hidden_layers=[400, 300]):

        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim,
                                          action_dim)
        self.exploration = OUProcess(self.action_dim)

        # define the operations used by this algorithm
        self.critic = {}
        self.critic['x'], self.critic['u'], self.critic[
            'is_train'], self.critic['q'], self.critic[
                'variables'] = self.create_critic_network(is_target=False)

        self.target_critic = {}
        self.target_critic['x'], self.target_critic[
            'u'], _, self.target_critic['q'], self.target_critic[
                'variables'] = self.create_critic_network(is_target=True)

        self.actor = {}
        self.actor['x'], self.actor['is_train'], self.actor['a'], self.actor[
            'variables'] = self.create_actor_network(is_target=False)

        self.target_actor = {}
        self.target_actor['x'], _, self.target_actor['a'], self.target_actor[
            'variables'] = self.create_actor_network(is_target=True)

        self.critic_optimization = {}
        with tf.name_scope('critic_optimization'):
            self.critic_optimization['y'] = tf.placeholder(tf.float32,
                                                           shape=(None, 1),
                                                           name='y')
            self.critic_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.critic['q'],
                                      self.critic_optimization['y']),
                name='loss')
            self.critic_optimization['optimize'] = tf.train.AdamOptimizer(
                critic_learning_rate).minimize(
                    self.critic_optimization['loss'])

        # define operation to get y
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward + (1-terminal) * gamma * target_q
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['q'] = tf.placeholder(tf.float32, shape=(None, 1))
            temp = tf.to_float(self.y_compute['t'])
            temp = tf.mul(temp, -1.0)
            temp = tf.add(temp, 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['q'], self.discount), temp))

        # define the operation to get the gradient of Q with respect to action
        self.action_gradients = {}
        with tf.name_scope('action_grads'):
            self.action_gradients["action_grads"] = tf.gradients(
                self.critic['q'], self.critic['u'])

        self.actor_optimization = {}
        with tf.name_scope('actor_optimization'):
            # first define the placeholder for the gradient of Q with respect to action
            self.actor_optimization['action_grads'] = tf.placeholder(
                tf.float32, shape=(None, self.action_dim))
            # the actor performs gradient ascent, so pass in the negated gradient
            self.actor_optimization['actor_variable_grads'] = tf.gradients(
                self.actor['a'], self.actor['variables'],
                -self.actor_optimization['action_grads'])
            self.actor_optimization['optimize'] = tf.train.AdamOptimizer(
                actor_learning_rate).apply_gradients(
                    zip(self.actor_optimization['actor_variable_grads'],
                        self.actor['variables']))

        self.soft_update_list = []
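        # Polyak-averaged target update: target <- tau * source + (1 - tau) * target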
        with tf.name_scope("soft_update"):
            for source, dest in zip(self.critic['variables'],
                                    self.target_critic['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(
                            tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))
            for source, dest in zip(self.actor['variables'],
                                    self.target_actor['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(
                            tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # after defining the computation graph, initialize all the variables
        self.session.run(tf.initialize_all_variables())

        summary_writer = tf.train.SummaryWriter('critic.graph',
                                                graph_def=self.session.graph)

    def create_actor_network(self, is_target):

        scope = 'tar_actor' if is_target else 'actor'

        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')

            # select the mode (training vs. evaluation) for batch normalization
            if self.batch_norm:
                # the target network always runs in evaluation mode
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            net = x

            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net,
                                          activation_fn=None,
                                          num_outputs=hidden_unit_num)
                    # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                    net = batch_norm(inputs=net,
                                     center=True,
                                     scale=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=is_train,
                                     updates_collections=None)
                else:
                    net = fully_connected(inputs=net,
                                          activation_fn=tf.nn.relu,
                                          num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=tf.tanh,
                num_outputs=self.action_dim,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        # get all the trainable variables from this scope
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)

        return x, is_train, net, variables

    def create_critic_network(self, is_target):
        scope = 'tar_critic' if is_target else 'critic'

        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')
            u = tf.placeholder(tf.float32,
                               shape=(None, self.action_dim),
                               name='actions')

            # select the mode (training vs. evaluation) for batch normalization
            if self.batch_norm:
                # the target network always runs in evaluation mode
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            # first concatenate the input
            # NOTE: unlike the original paper, the action is concatenated in at the first layer
            with tf.name_scope('merge'):
                net = tf.concat(1, [x, u])

            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net,
                                          activation_fn=None,
                                          num_outputs=hidden_unit_num)
                    # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                    net = batch_norm(inputs=net,
                                     center=True,
                                     scale=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=is_train,
                                     updates_collections=None)
                else:
                    net = fully_connected(inputs=net,
                                          activation_fn=tf.nn.relu,
                                          num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=None,
                num_outputs=1,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        # get all the trainable variables from this scope
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)

        return x, u, is_train, net, variables

    # define the functions for executing operations
    def predict_target_q(self, x, u):
        return self.session.run(self.target_critic['q'],
                                feed_dict={
                                    self.target_critic['x']: x,
                                    self.target_critic['u']: u
                                })

    def predict_target_action(self, x):
        return self.session.run(self.target_actor['a'],
                                feed_dict={self.target_actor['x']: x})

    def get_y(self, q, r, t):
        return self.session.run(self.y_compute['y'],
                                feed_dict={
                                    self.y_compute['r']: r,
                                    self.y_compute['q']: q,
                                    self.y_compute['t']: t
                                })

    def optimize_critic(self, x, u, is_train, y):
        if self.batch_norm:
            return self.session.run(self.critic_optimization['optimize'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic['is_train']: is_train,
                                        self.critic_optimization['y']: y
                                    })
        else:
            return self.session.run(self.critic_optimization['optimize'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic_optimization['y']: y
                                    })

    def predict_action(self, x, is_train):
        if self.batch_norm:
            return self.session.run(self.actor['a'],
                                    feed_dict={
                                        self.actor['x']: x,
                                        self.actor['is_train']: is_train
                                    })
        else:
            return self.session.run(self.actor['a'],
                                    feed_dict={self.actor['x']: x})

    def action_grads(self, x, u, is_train):
        if self.batch_norm:
            return self.session.run(self.action_gradients["action_grads"],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic['is_train']: is_train
                                    })
        else:
            return self.session.run(self.action_gradients["action_grads"],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u
                                    })

    def optimize_actor(self, x, a_grads, is_train):
        if self.batch_norm:
            return self.session.run(
                self.actor_optimization['optimize'],
                feed_dict={
                    self.actor['x']: x,
                    self.actor['is_train']: is_train,
                    self.actor_optimization['action_grads']: a_grads
                })
        else:
            return self.session.run(
                self.actor_optimization['optimize'],
                feed_dict={
                    self.actor['x']: x,
                    self.actor_optimization['action_grads']: a_grads
                })

    def soft_update(self):
        self.session.run(self.soft_update_list)

    def get_action(self, s):

        # make sure s has the expected shape (1, state_dim)
        s = np.reshape(s, (1, self.state_dim))

        a = self.predict_action(s, False)

        # a has a batch dimension of size 1, so take its first element
        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def learn(self, s, a, sprime, r, t):

        # first add the sample to the replay buffer
        self.replay_buffer.add(s, a, sprime, r, t)

        # start learning once there are enough samples for one minibatch
        if self.replay_buffer.get_size() > self.mini_batch_size:

            # perform several minibatch updates per call
            for i in xrange(self.update_per_iteration):
                state_set, action_set, sprime_set, reward_set, terminal_set = self.replay_buffer.sample_batch(
                    self.mini_batch_size)

                # first optimize the critic
                # compute Q'
                q = self.predict_target_q(
                    sprime_set, self.predict_target_action(sprime_set))

                # compute y = r + gamma * Q'
                y = self.get_y(q, reward_set, terminal_set)

                # optimize the critic with target y (training mode for batch norm)
                self.optimize_critic(state_set, action_set, True, y)

                # then optimize the actor
                actions = self.predict_action(state_set, True)
                a_grads = self.action_grads(state_set, actions, False)
                # NOTE: tf.gradients returns a list, so take its first element
                self.optimize_actor(state_set, a_grads[0], True)

                # using soft update to update target networks
                self.soft_update()

    def reset_exploration(self):
        self.exploration.reset()
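
The class above only exposes get_action, learn and reset_exploration, so a driver only has to step the environment and feed transitions back in. The sketch below is a minimal, hypothetical usage example; it assumes an OpenAI Gym continuous-control task (Pendulum-v0 is just an illustration), and the episode and step counts are arbitrary.

import gym
import tensorflow as tf

env = gym.make('Pendulum-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

with tf.Session() as sess:
    agent = DDPG(sess, env, state_dim, action_dim)

    for episode in xrange(100):
        s = env.reset()
        agent.reset_exploration()
        for step in xrange(200):
            a = agent.get_action(s)
            sprime, r, done, _ = env.step(a)
            # learn() stores the transition and, once the buffer holds a
            # minibatch, runs update_per_iteration gradient updates
            agent.learn(s, a, sprime, r, done)
            s = sprime
            if done:
                break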
Code example #2

# NOTE: as in the previous example, this targets Python 2 and a pre-1.0
# TensorFlow API (tf.mul, tf.pack, tf.batch_matmul, tf.initialize_all_variables,
# tf.train.SummaryWriter); fully_connected and batch_norm match the
# tf.contrib.layers signatures of that era, and ReplayBuffer / OUProcess are
# project-local helpers.
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm, fully_connected


class NAF(object):
    def __init__(self,
                 sess,
                 env,
                 state_dim,
                 action_dim,
                 max_buffer_size=100000,
                 update_per_iteration=5,
                 mini_batch_size=64,
                 discount=0.99,
                 batch_norm=True,
                 learning_rate=1e-3,
                 tau=0.001,
                 hidden_layers=[200, 200]):

        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim,
                                          action_dim)
        self.exploration = OUProcess(self.action_dim)

        self.network = {}
        self.network['x'], self.network['u'], self.network['is_train'], self.network['V'], self.network['P'], \
            self.network['M'], self.network['Q'], self.network['variables'] = self.create_networks(is_target=False)

        self.target = {}
        self.target['x'], self.target['u'], _, self.target['V'], self.target['P'], \
            self.target['M'], self.target['Q'], self.target['variables'] = self.create_networks(is_target=True)

        #define optimization operations
        self.network_optimization = {}
        with tf.name_scope('optimization'):
            self.network_optimization['y'] = tf.placeholder(tf.float32,
                                                            shape=(None, 1),
                                                            name='y')
            self.network_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.network['Q'],
                                      self.network_optimization['y']),
                name='loss')
            self.network_optimization['optimize'] = tf.train.AdamOptimizer(
                learning_rate).minimize(self.network_optimization['loss'])

        # define the operations that compute the y value
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward +  (1-terminal) * gamma * V
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['v'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['y'] = tf.to_float(self.y_compute['t'])
            self.y_compute['y'] = tf.mul(self.y_compute['y'], -1.0)
            self.y_compute['y'] = tf.add(self.y_compute['y'], 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['v'], self.discount),
                       self.y_compute['y']))

        # define the soft update operation between the normal networks and target networks
        self.soft_update_list = []
        with tf.name_scope('soft_update'):
            for source, dest in zip(self.network['variables'],
                                    self.target['variables']):
                self.soft_update_list.append(
                    dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # after defining the computation graph, initialize all the variables
        self.session.run(tf.initialize_all_variables())

        summary_writer = tf.train.SummaryWriter('naf.graph',
                                                graph_def=self.session.graph)

    def create_networks(self, is_target):

        scope = 'tar_naf' if is_target else 'naf'

        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')
            u = tf.placeholder(tf.float32,
                               shape=(None, self.action_dim),
                               name='actions')

            # select the mode (training vs. evaluation) for batch normalization
            if self.batch_norm:
                # the target network always runs in evaluation mode
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            # define operations for the value function
            with tf.variable_scope('V'):
                V = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        V = fully_connected(inputs=V,
                                            activation_fn=None,
                                            num_outputs=hidden_unit_num)
                        # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                        V = batch_norm(inputs=V,
                                       center=True,
                                       scale=True,
                                       activation_fn=tf.nn.relu,
                                       is_training=is_train,
                                       updates_collections=None)
                    else:
                        V = fully_connected(inputs=V,
                                            activation_fn=tf.nn.relu,
                                            num_outputs=hidden_unit_num)

                # add in the last layer
                V = fully_connected(inputs=V,
                                    activation_fn=None,
                                    num_outputs=1)

            # define operations that compute the covariance matrix P
            with tf.variable_scope('L'):
                L = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        L = fully_connected(inputs=L,
                                            activation_fn=None,
                                            num_outputs=hidden_unit_num)
                        # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                        L = batch_norm(inputs=L,
                                       center=True,
                                       scale=True,
                                       activation_fn=tf.nn.relu,
                                       is_training=is_train,
                                       updates_collections=None)
                    else:
                        L = fully_connected(inputs=L,
                                            activation_fn=tf.nn.relu,
                                            num_outputs=hidden_unit_num)

                L = fully_connected(inputs=L,
                                    activation_fn=None,
                                    num_outputs=(self.action_dim *
                                                 (self.action_dim + 1) / 2))

                #construct upper triangular matrix U
                pivot = 0
                rows = []
                for index in xrange(self.action_dim):
                    count = self.action_dim - index

                    # slice one element at position pivot from the second dimension and apply exp to it
                    # NOTE: the first dimension is the batch; -1 keeps every element along it
                    diag_elem = tf.exp(tf.slice(L, (0, pivot), (-1, 1)))

                    # slice the next count - 1 elements from the second dimension;
                    # count is the number of non-zero elements in this row
                    # NOTE: as index grows, count shrinks
                    non_diag_elems = tf.slice(L, (0, pivot + 1),
                                              (-1, count - 1))

                    # concatenate the tensors to form one row of the matrix
                    non_zero_elements = tf.concat(1,
                                                  (diag_elem, non_diag_elems))

                    # ((0, 0), (index, 0)) are the paddings; the matrix is 2-D,
                    # so the tuple has one entry per dimension:
                    # (0, 0) pads nothing along the first (batch) dimension,
                    # and (index, 0) pads `index` zeros before the elements
                    # along the second dimension, left-aligning this row
                    row = tf.pad(non_zero_elements, ((0, 0), (index, 0)))
                    rows.append(row)

                    # take off the elements we already used
                    pivot += count

                # tf.pack stacks a list of rank-R tensors into one rank-(R+1) tensor;
                # axis=1 stacks the rows along the second dimension
                # NOTE: this yields the upper triangular matrix U, not L
                L = tf.pack(rows, axis=1)

                # covariance matrix P = L*L^{T} = U^{T}*U
                P = tf.batch_matmul(tf.transpose(L, perm=[0, 2, 1]), L)

            # define operations that compute Mu (the action mean)
            with tf.variable_scope('M'):
                M = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        M = fully_connected(inputs=M,
                                            activation_fn=None,
                                            num_outputs=hidden_unit_num)
                        # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                        # see https://github.com/tensorflow/tensorflow/issues/1122
                        M = batch_norm(inputs=M,
                                       center=True,
                                       scale=True,
                                       activation_fn=tf.nn.relu,
                                       is_training=is_train,
                                       updates_collections=None)
                    else:
                        M = fully_connected(inputs=M,
                                            activation_fn=tf.nn.relu,
                                            num_outputs=hidden_unit_num)

                # add in the last layer
                M = fully_connected(inputs=M,
                                    activation_fn=tf.tanh,
                                    num_outputs=self.action_dim)

            # define operations that compute the advantage function
            with tf.name_scope('A'):
                # expand u - M to shape (batch, action_dim, 1), i.e. one
                # column vector per batch element, so it can be matrix-multiplied
                Aprime = tf.expand_dims(u - M, -1)
                # transpose each instance (keeping the batch dimension) to get
                # the corresponding row vector of shape (batch, 1, action_dim)
                A = tf.transpose(Aprime, perm=[0, 2, 1])
                # A = -1/2 * (u-M)^{T} * P * (u-M)
                # multiply row vector, P and column vector so the shapes conform:
                # (1 x d) * (d x d) * (d x 1) -> (1 x 1)
                A = -tf.batch_matmul(tf.batch_matmul(A, P), Aprime) / 2
                # each A is now a 1x1 matrix; reshape to (batch_size, 1)
                A = tf.reshape(A, [-1, 1])

            with tf.name_scope('Q'):
                Q = A + V

        # get all the trainable variables from this scope
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)

        return x, u, is_train, V, P, M, Q, variables

    def predict_target_v(self, x):
        return self.session.run(self.target['V'],
                                feed_dict={self.target['x']: x})

    def get_y(self, v, r, t):
        return self.session.run(self.y_compute['y'],
                                feed_dict={
                                    self.y_compute['r']: r,
                                    self.y_compute['v']: v,
                                    self.y_compute['t']: t
                                })

    def optimize_network(self, x, u, is_train, y):
        if self.batch_norm:
            feed_dict = {
                self.network['x']: x,
                self.network['u']: u,
                self.network['is_train']: is_train,
                self.network_optimization['y']: y
            }
        else:
            feed_dict = {
                self.network['x']: x,
                self.network['u']: u,
                self.network_optimization['y']: y
            }

        return self.session.run(self.network_optimization['optimize'],
                                feed_dict=feed_dict)

    def predict_action(self, x, is_train):
        if self.batch_norm:
            feed_dict = {
                self.network['x']: x,
                self.network['is_train']: is_train
            }
        else:
            feed_dict = {self.network['x']: x}
        return self.session.run([self.network['M'], self.network['P']],
                                feed_dict=feed_dict)

    def get_action(self, s):

        s = np.reshape(s, (1, self.state_dim))

        a, covariance = self.predict_action(s, False)

        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def soft_update(self):
        self.session.run(self.soft_update_list)

    def learn(self, s, a, sprime, r, terminal):
        # first add the sample to the replay buffer
        self.replay_buffer.add(s, a, sprime, r, terminal)

        # start learning once there are enough samples for one minibatch
        if self.replay_buffer.get_size() > self.mini_batch_size:

            # perform several minibatch updates per call
            for i in xrange(self.update_per_iteration):
                state_set, action_set, sprime_set, reward_set, terminal_set = self.replay_buffer.sample_batch(
                    self.mini_batch_size)

                # compute V'
                v = self.predict_target_v(sprime_set)

                # compute y = r + gamma * V'
                y = self.get_y(v, reward_set, terminal_set)

                # optimize the network with target y (training mode for batch norm)
                self.optimize_network(state_set, action_set, True, y)

                # using soft update to update target networks
                self.soft_update()

    def reset_exploration(self):
        self.exploration.reset()
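
The trickiest part of create_networks is the loop that turns the flat output of size action_dim * (action_dim + 1) / 2 into an upper-triangular matrix with an exponentiated diagonal. The NumPy sketch below mirrors that slicing logic for a single sample with action_dim = 3; the values of l are arbitrary, and the snippet is only an illustration of the indexing, not part of the class.

import numpy as np

action_dim = 3
# flat network output for one sample: 3 * (3 + 1) / 2 = 6 values
l = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])

pivot = 0
rows = []
for index in range(action_dim):
    count = action_dim - index
    diag_elem = np.exp(l[pivot:pivot + 1])     # exponentiated diagonal entry
    non_diag = l[pivot + 1:pivot + count]      # remaining entries of this row
    row = np.concatenate([np.zeros(index), diag_elem, non_diag])
    rows.append(row)
    pivot += count

U = np.stack(rows)        # upper triangular with a strictly positive diagonal
P = U.T.dot(U)            # positive definite, as used in the advantage term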
Code example #3

# NOTE: this example targets Python 2 (map returning lists) and an old PyTorch
# API (torch.autograd.Variable, tensor.data[0]). Actor, Critic, Normalizer,
# OUProcess, PrioritizedReplayMemory and logger are project-local modules and
# are not shown here.
import os
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optimizer
from torch.autograd import Variable


class DDPG(object):
    def __init__(self,
                 n_states,
                 n_actions,
                 opt,
                 ouprocess=True,
                 mean_var_path=None,
                 supervised=False):
        """ DDPG Algorithms
        Args:
            n_states: int, dimension of states
            n_actions: int, dimension of actions
            opt: dict, params
            supervised: bool, pre-train the actor with supervised learning
        """
        self.n_states = n_states
        self.n_actions = n_actions

        # Params
        self.alr = opt['alr']
        self.clr = opt['clr']
        self.model_name = opt['model']
        self.batch_size = opt['batch_size']
        self.gamma = opt['gamma']
        self.tau = opt['tau']
        self.ouprocess = ouprocess

        if mean_var_path is None:
            mean = np.zeros(n_states)
            var = np.zeros(n_states)
        elif not os.path.exists(mean_var_path):
            mean = np.zeros(n_states)
            var = np.zeros(n_states)
        else:
            with open(mean_var_path, 'rb') as f:
                mean, var = pickle.load(f)

        self.normalizer = Normalizer(mean, var)

        if supervised:
            self._build_actor()
            logger.info("Supervised Learning Initialized")
        else:
            # Build Network
            self._build_network()
            logger.info('Finish Initializing Networks')

        self.replay_memory = PrioritizedReplayMemory(
            capacity=opt['memory_size'])
        # self.replay_memory = ReplayMemory(capacity=opt['memory_size'])
        self.noise = OUProcess(n_actions)
        logger.info('DDPG Initialized!')

    @staticmethod
    def totensor(x):
        return Variable(torch.FloatTensor(x))

    def _build_actor(self):
        noisy = not self.ouprocess
        self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
        self.actor_criterion = nn.MSELoss()
        self.actor_optimizer = optimizer.Adam(lr=self.alr,
                                              params=self.actor.parameters())

    def _build_network(self):
        noisy = not self.ouprocess
        self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
        self.target_actor = Actor(self.n_states, self.n_actions)
        self.critic = Critic(self.n_states, self.n_actions)
        self.target_critic = Critic(self.n_states, self.n_actions)

        # if model params are provided, load them
        if len(self.model_name):
            self.load_model(model_name=self.model_name)
            logger.info("Loading model from file: {}".format(self.model_name))

        # Copy actor's parameters
        self._update_target(self.target_actor, self.actor, tau=1.0)

        # Copy critic's parameters
        self._update_target(self.target_critic, self.critic, tau=1.0)

        self.loss_criterion = nn.MSELoss()
        self.actor_optimizer = optimizer.Adam(lr=self.alr,
                                              params=self.actor.parameters(),
                                              weight_decay=1e-5)
        self.critic_optimizer = optimizer.Adam(lr=self.clr,
                                               params=self.critic.parameters(),
                                               weight_decay=1e-5)

    @staticmethod
    def _update_target(target, source, tau):
        for (target_param, param) in zip(target.parameters(),
                                         source.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) +
                                    param.data * tau)

    def reset(self, sigma):
        self.noise.reset(sigma)

    def _sample_batch(self):
        batch, idx = self.replay_memory.sample(self.batch_size)
        # batch = self.replay_memory.sample(self.batch_size)
        states = map(lambda x: x[0].tolist(), batch)
        next_states = map(lambda x: x[3].tolist(), batch)
        actions = map(lambda x: x[1].tolist(), batch)
        rewards = map(lambda x: x[2], batch)
        terminates = map(lambda x: x[4], batch)

        return idx, states, next_states, actions, rewards, terminates

    def add_sample(self, state, action, reward, next_state, terminate):
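        # compute the TD error of this transition and use it as its priority
        # when inserting into the prioritized replay memory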
        self.critic.eval()
        self.actor.eval()
        self.target_critic.eval()
        self.target_actor.eval()
        batch_state = self.normalizer([state.tolist()])
        batch_next_state = self.normalizer([next_state.tolist()])
        current_value = self.critic(batch_state,
                                    self.totensor([action.tolist()]))
        target_action = self.target_actor(batch_next_state)
        target_value = self.totensor([reward]) \
            + self.totensor([0 if x else 1 for x in [terminate]]) \
            * self.target_critic(batch_next_state, target_action) * self.gamma
        error = float(torch.abs(current_value - target_value).data.numpy()[0])

        self.target_actor.train()
        self.actor.train()
        self.critic.train()
        self.target_critic.train()
        self.replay_memory.add(error,
                               (state, action, reward, next_state, terminate))

    def update(self):
        """ Update the Actor and Critic with a batch data
        """
        idxs, states, next_states, actions, rewards, terminates = self._sample_batch(
        )
        batch_states = self.normalizer(states)  # totensor(states)
        batch_next_states = self.normalizer(
            next_states)  # Variable(torch.FloatTensor(next_states))
        batch_actions = self.totensor(actions)
        batch_rewards = self.totensor(rewards)
        mask = [0 if x else 1 for x in terminates]
        mask = self.totensor(mask)

        target_next_actions = self.target_actor(batch_next_states).detach()
        target_next_value = self.target_critic(
            batch_next_states, target_next_actions).detach().squeeze(1)

        current_value = self.critic(batch_states, batch_actions)
        next_value = batch_rewards + mask * target_next_value * self.gamma
        # Update Critic

        # update prioritized memory
        error = torch.abs(current_value - next_value).data.numpy()
        for i in range(self.batch_size):
            idx = idxs[i]
            self.replay_memory.update(idx, error[i][0])

        loss = self.loss_criterion(current_value, next_value)
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        # Update Actor
        self.critic.eval()
        policy_loss = -self.critic(batch_states, self.actor(batch_states))
        policy_loss = policy_loss.mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()

        self.actor_optimizer.step()
        self.critic.train()

        self._update_target(self.target_critic, self.critic, tau=self.tau)
        self._update_target(self.target_actor, self.actor, tau=self.tau)

        return loss.data[0], policy_loss.data[0]

    def choose_action(self, x):
        """ Select Action according to the current state
        Args:
            x: np.array, current state
        """
        self.actor.eval()
        act = self.actor(self.normalizer([x.tolist()])).squeeze(0)
        self.actor.train()
        action = act.data.numpy()
        if self.ouprocess:
            action += self.noise.noise()
        return action.clip(0, 1)

    def sample_noise(self):
        self.actor.sample_noise()

    def load_model(self, model_name):
        """ Load Torch Model from files
        Args:
            model_name: str, model path
        """
        self.actor.load_state_dict(
            torch.load('{}_actor.pth'.format(model_name)))
        self.critic.load_state_dict(
            torch.load('{}_critic.pth'.format(model_name)))

    def save_model(self, model_dir, title):
        """ Save Torch Model from files
        Args:
            model_dir: str, model dir
            title: str, model name
        """
        torch.save(self.actor.state_dict(),
                   '{}/{}_actor.pth'.format(model_dir, title))

        torch.save(self.critic.state_dict(),
                   '{}/{}_critic.pth'.format(model_dir, title))

    def save_actor(self, path):
        """ save actor network
        Args:
             path: str, path to save
        """
        torch.save(self.actor.state_dict(), path)

    def load_actor(self, path):
        """ load actor network
        Args:
             path: str, path to load
        """
        self.actor.load_state_dict(torch.load(path))

    def train_actor(self, batch_data, is_train=True):
        """ Train the actor separately with data
        Args:
            batch_data: tuple, (states, actions)
            is_train: bool
        Return:
            _loss: float, training loss
        """
        states, action = batch_data

        if is_train:
            self.actor.train()
            pred = self.actor(self.normalizer(states))
            action = self.totensor(action)

            _loss = self.actor_criterion(pred, action)

            self.actor_optimizer.zero_grad()
            _loss.backward()
            self.actor_optimizer.step()

        else:
            self.actor.eval()
            pred = self.actor(self.normalizer(states))
            action = self.totensor(action)
            _loss = self.actor_criterion(pred, action)

        return _loss.data[0]
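
The sketch below shows one hypothetical way to drive the class above: the opt keys are exactly the ones __init__ reads, but their values, the state/action sizes, and the RandomEnv stub are illustrative assumptions, not part of the original code.

import numpy as np


class RandomEnv(object):
    """Toy stand-in environment, used only to illustrate the agent API."""

    def __init__(self, n_states, n_actions):
        self.n_states = n_states
        self.n_actions = n_actions

    def reset(self):
        return np.random.rand(self.n_states)

    def step(self, action):
        next_state = np.random.rand(self.n_states)
        reward = -float(np.linalg.norm(action - 0.5))
        terminate = False
        return next_state, reward, terminate


opt = {
    'alr': 1e-4,            # actor learning rate
    'clr': 1e-3,            # critic learning rate
    'model': '',            # empty string: do not load a pretrained model
    'batch_size': 64,
    'gamma': 0.99,
    'tau': 0.001,
    'memory_size': 100000,
}

env = RandomEnv(n_states=63, n_actions=16)
agent = DDPG(n_states=env.n_states, n_actions=env.n_actions, opt=opt)

state = env.reset()
for step in range(1000):
    action = agent.choose_action(state)
    next_state, reward, terminate = env.step(action)
    agent.add_sample(state, action, reward, next_state, terminate)
    if step > opt['batch_size']:  # wait until the memory holds one minibatch
        critic_loss, actor_loss = agent.update()
    state = next_state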