Ejemplo n.º 1
0
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)],
                             updates=[update_op])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
Ejemplo n.º 2
0
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    entval = calcent(Mval).mean()  #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
Ejemplo n.º 3
0
def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(2, 2) == 10
Ejemplo n.º 4
0
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2

        lin = function([x, x2], z, givens={x2: 0})
        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
            expt_caught = False
Ejemplo n.º 5
0
    def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
        X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                                    async=1, kfac_update=2, cold_iter=50, \
                                    weight_decay_dict=wd_dict, max_grad_norm=None)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables
Ejemplo n.º 6
0
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ])
Ejemplo n.º 7
0
 def __init__(self, ob_dim, ac_dim):
     # Here we'll construct a bunch of expressions, which will be used in two places:
     # (1) When sampling actions
     # (2) When computing loss functions, for the policy update
     # Variables specific to (1) have the word "sampled" in them,
     # whereas variables specific to (2) have the word "old" in them
     ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2],
                            name="ob")  # batch of observations
     oldac_na = tf.placeholder(
         tf.float32, shape=[None, ac_dim],
         name="ac")  # batch of actions previous actions
     oldac_dist = tf.placeholder(
         tf.float32, shape=[None, ac_dim * 2], name="oldac_dist"
     )  # batch of actions previous action distributions
     adv_n = tf.placeholder(tf.float32, shape=[None],
                            name="adv")  # advantage function estimate
     wd_dict = {}
     h1 = tf.nn.tanh(
         dense(ob_no,
               64,
               "h1",
               weight_init=U.normc_initializer(1.0),
               bias_init=0.0,
               weight_loss_dict=wd_dict))
     h2 = tf.nn.tanh(
         dense(h1,
               64,
               "h2",
               weight_init=U.normc_initializer(1.0),
               bias_init=0.0,
               weight_loss_dict=wd_dict))
     mean_na = dense(h2,
                     ac_dim,
                     "mean",
                     weight_init=U.normc_initializer(0.1),
                     bias_init=0.0,
                     weight_loss_dict=wd_dict)  # Mean control output
     self.wd_dict = wd_dict
     self.logstd_1a = logstd_1a = tf.get_variable(
         "logstd", [ac_dim], tf.float32,
         tf.zeros_initializer())  # Variance on outputs
     logstd_1a = tf.expand_dims(logstd_1a, 0)
     std_1a = tf.exp(logstd_1a)
     std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
     ac_dist = tf.concat([
         tf.reshape(mean_na, [-1, ac_dim]),
         tf.reshape(std_na, [-1, ac_dim])
     ], 1)
     sampled_ac_na = tf.random_normal(
         tf.shape(ac_dist[:, ac_dim:])
     ) * ac_dist[:,
                 ac_dim:] + ac_dist[:, :
                                    ac_dim]  # This is the sampled action we'll perform.
     logprobsampled_n = -tf.reduce_sum(tf.log(
         ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(
             2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
                 tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) /
                 (tf.square(ac_dist[:, ac_dim:])),
                 axis=1)  # Logprob of sampled action
     logprob_n = -tf.reduce_sum(
         tf.log(ac_dist[:, ac_dim:]), axis=1
     ) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
         tf.square(ac_dist[:, :ac_dim] - oldac_na) /
         (tf.square(ac_dist[:, ac_dim:])),
         axis=1
     )  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
     kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
     #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
     surr = -tf.reduce_mean(
         adv_n * logprob_n
     )  # Loss function that we'll differentiate to get the policy gradient
     surr_sampled = -tf.reduce_mean(logprob_n)  # Sampled loss of the policy
     self._act = U.function([ob_no],
                            [sampled_ac_na, ac_dist, logprobsampled_n
                             ])  # Generate a new action and its logprob
     #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
     self.compute_kl = U.function([ob_no, oldac_dist], kl)
     self.update_info = (
         (ob_no, oldac_na, adv_n), surr, surr_sampled
     )  # Input and output variables needed for computing loss
     U.initialize()  # Initialize uninitialized TF variables
Ejemplo n.º 8
0
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr != None)
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)