Example #1
def update_target_networks(source_model_dict, target_model_dict, tau):
    """
	Helper function to perform target network updates
	:param source_nets: (list) of source networks tf.keras.Module type
	:param target_nets: (list) of target networks
	:param tau: (float) Polyak weight avg. param
	:return:
	"""
    # perform Polyak avg. i.e. "soft" updates
    source_vars, target_vars = [], []
    for model in source_model_dict.keys():
        source_vars = source_vars + list(
            source_model_dict[model].trainable_variables)
        target_vars = target_vars + list(
            target_model_dict[model].trainable_variables)

    # updating target networks
    update_target_variables(target_vars, source_vars, tau)
    return source_vars, target_vars
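A minimal usage sketch for the helper above. The snippet relies on an update_target_variables op imported elsewhere in its project; here a TF2 eager stand-in with the same signature is defined so the sketch runs on its own, and the Dense layers, dict keys, and tau value are illustrative assumptions only.

import tensorflow as tf

def update_target_variables(target_variables, source_variables, tau=1.0):
    # Assumed stand-in for the library op: target <- (1 - tau) * target + tau * source.
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign((1.0 - tau) * target_var + tau * source_var)

# Toy "models": anything exposing .trainable_variables works with the helper.
source_model_dict = {"actor": tf.keras.layers.Dense(4), "critic": tf.keras.layers.Dense(1)}
target_model_dict = {"actor": tf.keras.layers.Dense(4), "critic": tf.keras.layers.Dense(1)}

# Call each layer once so its variables are created.
dummy_input = tf.zeros([1, 8])
for model_dict in (source_model_dict, target_model_dict):
    for layer in model_dict.values():
        layer(dummy_input)

source_vars, target_vars = update_target_networks(
    source_model_dict, target_model_dict, tau=0.005)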
Example #2
  def testIncrementalUpdate(self, use_locking):
    """Tests incremental update of the target variables."""
    target_variables = [tf.Variable(tf.random_normal(shape=[1, 2]))]
    source_variables = [tf.Variable(tf.random_normal(shape=[1, 2]))]
    updated = target_update_ops.update_target_variables(
        target_variables, source_variables, tau=0.1, use_locking=use_locking)

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      before_assign = sess.run(target_variables[0])
      sess.run(updated)
      results = sess.run([target_variables[0], source_variables[0]])
      self.assertAllClose(results[0], 0.1 * results[1] + 0.9 * before_assign)
Example #3
    def testIncrementalUpdate(self, use_locking):
        """Tests incremental update of the target variables."""
        target_variables = [tf.Variable(tf.random_normal(shape=[1, 2]))]
        source_variables = [tf.Variable(tf.random_normal(shape=[1, 2]))]
        updated = target_update_ops.update_target_variables(
            target_variables,
            source_variables,
            tau=0.1,
            use_locking=use_locking)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            before_assign = sess.run(target_variables[0])
            sess.run(updated)
            results = sess.run([target_variables[0], source_variables[0]])
            self.assertAllClose(results[0],
                                0.1 * results[1] + 0.9 * before_assign)
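The assertion in these tests checks the soft-update rule target_new = tau * source + (1 - tau) * target_old. Below is a minimal TF2 eager sketch of the same check, with the soft update written inline so no trfl-style dependency is assumed.

import numpy as np
import tensorflow as tf

tau = 0.1
target = tf.Variable(tf.random.normal(shape=[1, 2]))
source = tf.Variable(tf.random.normal(shape=[1, 2]))

before_assign = target.numpy()
# Incremental ("soft") update, matching the assertion above.
target.assign(tau * source + (1.0 - tau) * target)

np.testing.assert_allclose(target.numpy(),
                           tau * source.numpy() + (1.0 - tau) * before_assign,
                           rtol=1e-6)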
Example #4
    def __init__(self, state_dim, action_dim, max_action, lr=3e-4):
        # In the SAC paper, this reward scale was used for Ant-v1 and HalfCheetah-v1.
        self.scale_reward = 5.0

        self.actor = GaussianActor(state_dim, action_dim, max_action)
        self.actor_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        self.vf = CriticV(state_dim)
        self.vf_target = CriticV(state_dim)
        self.vf_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        vf_init = target_update.update_target_variables(self.vf_target.weights, self.vf.weights)
        self.list_init_assign = [vf_init]

        self.qf1 = CriticQ(state_dim, action_dim, name="vq1")
        self.qf2 = CriticQ(state_dim, action_dim, name="vq2")

        self.qf1_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.qf2_optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Critical section to guard against concurrent access from the explorer.
        self.explorer_lock = tf.contrib.framework.CriticalSection()
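With no tau argument, the trfl-style helper defaults to tau=1.0, so list_init_assign hard-copies the freshly initialised value network into its target once at start-up. Below is a minimal TF2-style sketch of that initialisation step; the Sequential models and the hard_copy helper are stand-ins for CriticV and the library op, which come from the original project.

import tensorflow as tf

def hard_copy(target_vars, source_vars):
    # Equivalent to update_target_variables(..., tau=1.0): a full overwrite.
    for target_var, source_var in zip(target_vars, source_vars):
        target_var.assign(source_var)

state_dim = 8
make_vf = lambda: tf.keras.Sequential([tf.keras.layers.Dense(64, activation="relu"),
                                       tf.keras.layers.Dense(1)])
vf, vf_target = make_vf(), make_vf()
vf(tf.zeros([1, state_dim]))          # build the variables
vf_target(tf.zeros([1, state_dim]))

hard_copy(vf_target.weights, vf.weights)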
Example #5
  def testFullUpdate(self, use_locking):
    """Tests full update of the target variables from the source variables."""
    target_variables = [
        tf.Variable(tf.random_normal(shape=[1, 2])),
        tf.Variable(tf.random_normal(shape=[3, 4])),
    ]
    source_variables = [
        tf.Variable(tf.random_normal(shape=[1, 2])),
        tf.Variable(tf.random_normal(shape=[3, 4])),
    ]
    updated = target_update_ops.update_target_variables(
        target_variables, source_variables, use_locking=use_locking)

    # Collect all the tensors and ops we want to evaluate in the session.
    vars_ops = target_variables + source_variables

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(updated)
      results = sess.run(vars_ops)
      # First target variable is updated with first source variable.
      self.assertAllClose(results[0], results[2])
      # Second target variable is updated with second source variable.
      self.assertAllClose(results[1], results[3])
Example #6
    def testFullUpdate(self, use_locking):
        """Tests full update of the target variables from the source variables."""
        target_variables = [
            tf.Variable(tf.random_normal(shape=[1, 2])),
            tf.Variable(tf.random_normal(shape=[3, 4])),
        ]
        source_variables = [
            tf.Variable(tf.random_normal(shape=[1, 2])),
            tf.Variable(tf.random_normal(shape=[3, 4])),
        ]
        updated = target_update_ops.update_target_variables(
            target_variables, source_variables, use_locking=use_locking)

        # Collect all the tensors and ops we want to evaluate in the session.
        vars_ops = target_variables + source_variables

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(updated)
            results = sess.run(vars_ops)
            # First target variable is updated with first source variable.
            self.assertAllClose(results[0], results[2])
            # Second target variable is updated with second source variable.
            self.assertAllClose(results[1], results[3])
Example #7
    def __init__(
        self,
        obs_spec: dm_env.specs.Array,
        action_spec: dm_env.specs.BoundedArray,
        ensemble: Sequence[snt.AbstractModule],
        target_ensemble: Sequence[snt.AbstractModule],
        batch_size: int,
        agent_discount: float,
        replay_capacity: int,
        min_replay_size: int,
        sgd_period: int,
        target_update_period: int,
        optimizer: tf.train.Optimizer,
        mask_prob: float,
        noise_scale: float,
        epsilon_fn: Callable[[int], float] = lambda _: 0.,
        seed: int = None,
    ):
        """Bootstrapped DQN with additive prior functions."""
        # DQN configuration.
        self._ensemble = ensemble
        self._target_ensemble = target_ensemble
        self._num_actions = action_spec.maximum - action_spec.minimum + 1
        self._batch_size = batch_size
        self._sgd_period = sgd_period
        self._target_update_period = target_update_period
        self._min_replay_size = min_replay_size
        self._epsilon_fn = epsilon_fn
        self._replay = replay.Replay(capacity=replay_capacity)
        self._mask_prob = mask_prob
        self._noise_scale = noise_scale
        self._rng = np.random.RandomState(seed)
        tf.set_random_seed(seed)

        self._total_steps = 0
        self._total_episodes = 0
        self._active_head = 0
        self._num_ensemble = len(ensemble)
        assert len(ensemble) == len(target_ensemble)

        # Build the TensorFlow graph.
        session = tf.Session()

        # Placeholders = (obs, action, reward, discount, next_obs, mask, noise)
        o_tm1 = tf.placeholder(shape=(None, ) + obs_spec.shape,
                               dtype=obs_spec.dtype)
        a_tm1 = tf.placeholder(shape=(None, ), dtype=action_spec.dtype)
        r_t = tf.placeholder(shape=(None, ), dtype=tf.float32)
        d_t = tf.placeholder(shape=(None, ), dtype=tf.float32)
        o_t = tf.placeholder(shape=(None, ) + obs_spec.shape,
                             dtype=obs_spec.dtype)
        m_t = tf.placeholder(shape=(None, self._num_ensemble),
                             dtype=tf.float32)
        z_t = tf.placeholder(shape=(None, self._num_ensemble),
                             dtype=tf.float32)

        losses = []
        value_fns = []
        target_updates = []
        for k in range(self._num_ensemble):
            model = self._ensemble[k]
            target_model = self._target_ensemble[k]
            q_values = model(o_tm1)

            train_value = batched_index(q_values, a_tm1)
            target_value = tf.stop_gradient(
                tf.reduce_max(target_model(o_t), axis=-1))
            target_y = r_t + z_t[:, k] + agent_discount * d_t * target_value
            loss = tf.square(train_value - target_y) * m_t[:, k]

            value_fn = session.make_callable(q_values, [o_tm1])
            target_update = update_target_variables(
                target_variables=target_model.get_all_variables(),
                source_variables=model.get_all_variables(),
            )

            losses.append(loss)
            value_fns.append(value_fn)
            target_updates.append(target_update)

        sgd_op = optimizer.minimize(tf.stack(losses))
        self._value_fns = value_fns
        self._sgd_step = session.make_callable(
            sgd_op, [o_tm1, a_tm1, r_t, d_t, o_t, m_t, z_t])
        self._update_target_nets = session.make_callable(target_updates)
        session.run(tf.global_variables_initializer())
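Each ensemble head gets its own full-copy target-update op, and self._update_target_nets runs them all in a single session call. A hypothetical step hook showing how an agent like this might trigger the copy on a fixed period (the function name is an assumption; the attributes follow the constructor above):

def maybe_update_targets(agent, step):
    # Hypothetical helper: run every per-head copy op once per update period.
    if step % agent._target_update_period == 0:
        agent._update_target_nets()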
Example #8
    def __init__(
        self,
        obs_spec: dm_env.specs.Array,
        action_spec: dm_env.specs.BoundedArray,
        q_network: snt.AbstractModule,
        target_q_network: snt.AbstractModule,
        rho_network: snt.AbstractModule,
        l_network: Sequence[snt.AbstractModule],
        target_l_network: Sequence[snt.AbstractModule],
        batch_size: int,
        discount: float,
        replay_capacity: int,
        min_replay_size: int,
        sgd_period: int,
        target_update_period: int,
        optimizer_primal: tf.train.Optimizer,
        optimizer_dual: tf.train.Optimizer,
        optimizer_l: tf.train.Optimizer,
        learn_iters: int,
        l_approximators: int,
        min_l: float,
        kappa: float,
        eta1: float,
        eta2: float,
        seed: int = None,
    ):
        """Information seeking learner."""
        # ISL configurations.
        self.q_network = q_network
        self._target_q_network = target_q_network
        self.rho_network = rho_network
        self.l_network = l_network
        self._target_l_network = target_l_network
        self._num_actions = action_spec.maximum - action_spec.minimum + 1
        self._obs_shape = obs_spec.shape
        self._batch_size = batch_size
        self._sgd_period = sgd_period
        self._target_update_period = target_update_period
        self._optimizer_primal = optimizer_primal
        self._optimizer_dual = optimizer_dual
        self._optimizer_l = optimizer_l
        self._min_replay_size = min_replay_size
        self._replay = replay.Replay(capacity=replay_capacity)
        # Alternative: ISLReplay(capacity=replay_capacity, average_l=0, mu=0)
        self._rng = np.random.RandomState(seed)
        tf.set_random_seed(seed)
        self._kappa = kappa
        self._min_l = min_l
        self._eta1 = eta1
        self._eta2 = eta2
        self._learn_iters = learn_iters
        self._l_approximators = l_approximators
        self._total_steps = 0
        self._total_episodes = 0
        self._learn_iter_counter = 0

        # Build the TensorFlow graph.
        o = tf.placeholder(shape=obs_spec.shape, dtype=obs_spec.dtype)
        q = q_network(tf.expand_dims(o, 0))
        rho = rho_network(tf.expand_dims(o, 0))
        l = []
        for k in range(self._l_approximators):
            l.append(
                tf.concat([
                    l_network[k][a](tf.expand_dims(o, 0))
                    for a in range(self._num_actions)
                ],
                          axis=1))

        # Placeholders = (obs, action, reward, discount, next_obs)
        o_tm1 = tf.placeholder(shape=(None, ) + obs_spec.shape,
                               dtype=obs_spec.dtype)
        a_tm1 = tf.placeholder(shape=(None, ), dtype=action_spec.dtype)
        r_t = tf.placeholder(shape=(None, ), dtype=tf.float32)
        d_t = tf.placeholder(shape=(None, ), dtype=tf.float32)
        o_t = tf.placeholder(shape=(None, ) + obs_spec.shape,
                             dtype=obs_spec.dtype)
        chosen_l = tf.placeholder(shape=1,
                                  dtype=tf.int32,
                                  name='chosen_l_tensor')

        q_tm1 = q_network(o_tm1)
        rho_tm1 = rho_network(o_tm1)
        train_q_value = batched_index(q_tm1, a_tm1)
        train_rho_value = batched_index(rho_tm1, a_tm1)
        train_rho_value_no_grad = tf.stop_gradient(train_rho_value)
        if self._target_update_period > 1:
            q_t = target_q_network(o_t)
        else:
            q_t = q_network(o_t)

        l_tm1_all = tf.stack([
            tf.concat([
                self.l_network[k][a](o_tm1) for a in range(self._num_actions)
            ],
                      axis=1) for k in range(self._l_approximators)
        ],
                             axis=-1)
        l_tm1 = tf.squeeze(tf.gather(l_tm1_all, chosen_l, axis=-1), axis=-1)
        train_l_value = batched_index(l_tm1, a_tm1)

        if self._target_update_period > 1:
            l_online_t_all = tf.stack([
                tf.concat([
                    self.l_network[k][a](o_t) for a in range(self._num_actions)
                ],
                          axis=1) for k in range(self._l_approximators)
            ],
                                      axis=-1)
            l_online_t = tf.squeeze(tf.gather(l_online_t_all,
                                              chosen_l,
                                              axis=-1),
                                    axis=-1)
            l_t_all = tf.stack([
                tf.concat([
                    self._target_l_network[k][a](o_t)
                    for a in range(self._num_actions)
                ],
                          axis=1) for k in range(self._l_approximators)
            ],
                               axis=-1)
            l_t = tf.squeeze(tf.gather(l_t_all, chosen_l, axis=-1), axis=-1)
            max_ind = tf.math.argmax(l_online_t, axis=1)
        else:
            l_t_all = tf.stack([
                tf.concat([
                    self.l_network[k][a](o_t) for a in range(self._num_actions)
                ],
                          axis=1) for k in range(self._l_approximators)
            ],
                               axis=-1)
            l_t = tf.squeeze(tf.gather(l_t_all, chosen_l, axis=-1), axis=-1)
            max_ind = tf.math.argmax(l_t, axis=1)

        soft_max_value = tf.stop_gradient(
            tf.py_function(func=self.soft_max, inp=[q_t, l_t],
                           Tout=tf.float32))
        q_target_value = r_t + discount * d_t * soft_max_value
        delta_primal = train_q_value - q_target_value
        loss_primal = tf.add(eta2 * train_rho_value_no_grad * delta_primal,
                             (1 - eta2) * 0.5 * tf.square(delta_primal),
                             name='loss_q')

        delta_dual = tf.stop_gradient(delta_primal)
        loss_dual = tf.square(delta_dual - train_rho_value, name='loss_rho')

        l_greedy_estimate = tf.add((1 - eta1) * tf.math.abs(delta_primal),
                                   eta1 * tf.math.abs(train_rho_value_no_grad),
                                   name='l_greedy_estimate')
        l_target_value = tf.stop_gradient(
            l_greedy_estimate + discount * d_t * batched_index(l_t, max_ind),
            name='l_target')
        loss_l = 0.5 * tf.square(train_l_value - l_target_value)

        train_op_primal = self._optimizer_primal.minimize(loss_primal)
        train_op_dual = self._optimizer_dual.minimize(loss_dual)
        train_op_l = self._optimizer_l.minimize(loss_l)

        # create target update operations
        if self._target_update_period > 1:
            target_updates = []
            target_update = update_target_variables(
                target_variables=self._target_q_network.get_all_variables(),
                source_variables=self.q_network.get_all_variables(),
            )
            target_updates.append(target_update)
            for k in range(self._l_approximators):
                for a in range(self._num_actions):
                    model = self.l_network[k][a]
                    target_model = self._target_l_network[k][a]
                    target_update = update_target_variables(
                        target_variables=target_model.get_all_variables(),
                        source_variables=model.get_all_variables(),
                    )
                    target_updates.append(target_update)

        # Make session and callables.
        session = tf.Session()
        self._sgd = session.make_callable(
            [train_op_l, train_op_primal, train_op_dual],
            [o_tm1, a_tm1, r_t, d_t, o_t, chosen_l])
        self._q_fn = session.make_callable(q, [o])
        self._rho_fn = session.make_callable(rho, [o])
        self._l_fn = []
        for k in range(self._l_approximators):
            self._l_fn.append(session.make_callable(l[k], [o]))
        if self._target_update_period > 1:
            self._update_target_nets = session.make_callable(target_updates)
        session.run(tf.global_variables_initializer())
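A hypothetical learning step for this learner, sketching how the batched callables built above might be fed. The replay sampling call and its field order are assumptions inferred from the placeholder list; only _sgd, _update_target_nets and the counters are taken from the constructor.

import numpy as np

def hypothetical_learn_step(agent):
    # Sample a transition batch; field order mirrors the placeholders above.
    o_tm1, a_tm1, r_t, d_t, o_t = agent._replay.sample(agent._batch_size)
    # Pick which l-approximator to train this step (shape-[1] int32 feed).
    chosen_l = np.array([agent._rng.randint(agent._l_approximators)], dtype=np.int32)
    agent._sgd(o_tm1, a_tm1, r_t, d_t, o_t, chosen_l)
    agent._learn_iter_counter += 1
    if (agent._target_update_period > 1
            and agent._learn_iter_counter % agent._target_update_period == 0):
        agent._update_target_nets()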
Example #9
    def train(self, states, actions, rewards, next_states, discount_rate, weights, tau=0.005):
        assert len(rewards.shape) == 1
        assert len(discount_rate.shape) == 1
        assert len(weights.shape) == 1

        # Critic Update
        with tf.device("/gpu:0"):
            q1 = self.qf1([states, actions])
            q2 = self.qf2([states, actions])
            vf_next_target_t = self.vf_target(next_states)

            # Equation (7, 8)
            ys = tf.stop_gradient(
                self.scale_reward * rewards + discount_rate * vf_next_target_t
            )

            td_loss1 = tf.reduce_mean(huber_loss(ys - q1) * weights)
            td_loss2 = tf.reduce_mean(huber_loss(ys - q2) * weights)

            # Equation (9)
            q1_grad = tf.gradients(td_loss1, self.qf1.trainable_variables)
            update_q1 = self.qf1_optimizer.apply_gradients(zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tf.gradients(td_loss2, self.qf2.trainable_variables)
            update_q2 = self.qf2_optimizer.apply_gradients(zip(q2_grad, self.qf2.trainable_variables))

            update_q = tf.group([update_q1, update_q2])

        # Actor Update
        with tf.device("/gpu:0"):
            vf_t = self.vf(states)
            sample_actions, log_pi = self.actor(states)

            tf.contrib.summary.scalar(name="log_pi_min", tensor=tf.reduce_min(log_pi))
            tf.contrib.summary.scalar(name="log_pi_max", tensor=tf.reduce_max(log_pi))

            # TODO lock for explorer_td_error
            with tf.control_dependencies([update_q]):
                q1 = self.qf1([states, sample_actions])
                q2 = self.qf2([states, sample_actions])
            min_q = tf.minimum(q1, q2)

            # Equation (12)
            policy_loss = tf.reduce_mean((log_pi - q1) * weights)

            # Equation (5)
            target_vf = tf.stop_gradient(min_q - log_pi)

            #vf_loss_t = 0.5 * tf.reduce_mean((target_vf - vf_t)**2 * weights)
            vf_loss_t = tf.reduce_mean(huber_loss(target_vf - vf_t) * weights)

            # Equation (6)
            vf_grad = tf.gradients(vf_loss_t, self.vf.trainable_variables)
            update_vf = self.vf_optimizer.apply_gradients(zip(vf_grad, self.vf.trainable_variables))

            # Equation (13)
            actor_grad = tf.gradients(policy_loss, self.actor.trainable_variables)

            # The actor can also be accessed by the explorer, so apply its update inside the critical section.
            def _update_actor():
                update_actor = self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
                return update_actor

            update_actor = self.explorer_lock.execute(_update_actor)

            with tf.control_dependencies([update_vf]):
                update_vf_target = target_update.update_target_variables(self.vf_target.weights, self.vf.weights,
                                                                         tau)

        updates = tf.group([update_q, update_vf, update_actor, update_vf_target])

        return updates, policy_loss, vf_loss_t, td_loss1
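The control_dependencies block near the end guarantees that the Polyak copy into vf_target reads the value-network weights only after update_vf has been applied. A tiny self-contained TF1-compat sketch of that ordering pattern with toy variables (not the SAC networks):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

tau = 0.5
source = tf.get_variable("source", initializer=[0.0])    # stands in for self.vf
target = tf.get_variable("target", initializer=[10.0])   # stands in for self.vf_target

update_source = source.assign_add([1.0])                 # stands in for update_vf
with tf.control_dependencies([update_source]):
    # The soft update only runs (and reads `source`) after update_source has executed.
    update_target = target.assign(tau * source + (1.0 - tau) * target)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(update_target))  # expected [5.5]: the copy saw source == 1.0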