Example #1
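# Assumes the usual baselines-style imports:
#   import numpy as np
#   import tensorflow as tf
#   import baselines.common.tf_util as U
#   from baselines.common.mpi_adam import MpiAdam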
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # Recompute the loss and flat gradient only; running update_op here as well
    # would double-step the variables and break the comparison with MpiAdam.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
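A minimal way to run this test single-process, assuming U is baselines' tf_util (the session helper comes from that module); with one worker, MpiAdam should reproduce the losses printed by the first loop.

if __name__ == '__main__':
    # tf.Session is a context manager, so the session opened here becomes the
    # default session that U.function and the initializer calls above rely on.
    with U.single_threaded_session():
        test_MpiAdam()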
Example #2
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    entval = calcent(Mval).mean()  #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
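A sketch of how validate_probtype is typically driven, modeled on baselines' own test_probtypes; CategoricalPdType and DiagGaussianPdType are assumed to be imported from baselines.common.distributions, and a default TF session is assumed to be active.

np.random.seed(0)

pdparam_categorical = np.array([-.2, .3, .5])
validate_probtype(CategoricalPdType(pdparam_categorical.size), pdparam_categorical)

pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
validate_probtype(DiagGaussianPdType(pdparam_diag_gauss.size // 2), pdparam_diag_gauss)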
Example #3
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 hid_size,
                 num_hid_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True):
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            # obz = ob

            # value function
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            self.vpred = U.dense(last_out,
                                 1,
                                 "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

            # master policy
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            "masterpol%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            self.selector = U.dense(last_out,
                                    num_subpolicies, "masterpol_final",
                                    U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
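The constructor only wires up _act; the baselines-style policies this mirrors usually pair it with a thin act wrapper that batches a single observation. A sketch under that assumption (the method name is not from the snippet):

    def act(self, stochastic, ob):
        # Feed one observation as a batch of size 1 and unpack the results.
        ac1, vpred1 = self._act(stochastic, ob[None])
        return ac1[0], vpred1[0]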
Example #4
    def __init__(self, env, master_policy, old_master_policy, sub_policies,
                 old_sub_policies, comm, clip_param=0.2, entcoeff=0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64):
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.num_subpolicies = len(sub_policies)
        self.sub_policies = sub_policies
        self.master_policy = master_policy
        ob_space = env.observation_space
        ac_space = env.action_space
        self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
        atarg = tf.placeholder(dtype=tf.float32, shape=[None])
        ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
        # inputs for training theta
        ob = U.get_placeholder_cached(name="ob")
        ob_master = U.get_placeholder_cached(name="adv_ob")
        ac_master = master_policy.pdtype.sample_placeholder([None])
        loss_master = self.policy_loss_master(master_policy, old_master_policy, ob_master, ac_master, atarg, ret, clip_param)
        self.master_policy_var_list = master_policy.get_trainable_variables()
        self.master_loss = U.function([ob_master, ac_master, atarg, ret], U.flatgrad(loss_master, self.master_policy_var_list))
        self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)


        self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(old_master_policy.get_variables(), master_policy.get_variables())])


        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []



        for i in range(self.num_subpolicies):
            varlist = sub_policies[i].get_trainable_variables()
            self.adams.append(MpiAdam(varlist))
            # loss for test
            loss = self.policy_loss(sub_policies[i], sub_policies[(i-1)%2], old_sub_policies[i], ob, self.sp_ac, atarg, ret, clip_param)
            self.losses.append(U.function([ob, self.sp_ac, atarg, ret], U.flatgrad(loss, varlist)))

            self.assign_subs.append(U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(), sub_policies[i].get_variables())]))
            self.zerograd = U.function([], self.nograd(varlist))

        U.initialize()

        self.master_adam.sync()
        for i in range(self.num_subpolicies):
            self.adams[i].sync()
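The constructor calls a nograd helper that the snippet does not show; in the mlsh-style code this appears to come from, it simply builds a flat zero vector matching the variable list, usable as a no-op gradient for MpiAdam. A sketch under that assumption:

    def nograd(self, var_list):
        # Flat vector of zeros with the same total size as var_list; used
        # above to build self.zerograd.
        return tf.concat(axis=0, values=[
            tf.reshape(tf.zeros_like(v), [U.numel(v)]) for v in var_list
        ])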
Example #5
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 network='mlp',
                 gaussian_fixed_var=True,
                 nsteps=None,
                 nbatch=None,
                 nlstm=256,
                 states=None,
                 masks=None,
                 reuse=False):
        self.network = network

        shape = []
        for d in range(1, len(ob.shape)):
            shape.append(ob.shape[d])

        with tf.variable_scope(name, reuse=reuse):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=shape)
            obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            if network == 'mlp':
                hid_size = 64
                num_hid_layers = 2
                self.hid_size = hid_size
                self.num_hid_layers = num_hid_layers
                self.gaussian_fixed_var = gaussian_fixed_var
                self._mlp(obs, hid_size, num_hid_layers, ac_space,
                          gaussian_fixed_var)
            elif network == 'cnn':
                self._cnn(obs, ac_space, gaussian_fixed_var)
            elif network == 'lstm':
                assert nsteps is not None and nbatch is not None
                assert states is not None and masks is not None
                assert isinstance(nsteps, int) and isinstance(nbatch, int)
                assert nsteps > 0 and nbatch > 0
                self._lstm(obs, states, masks, nlstm, ac_space, nbatch, nsteps)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if network == 'mlp' or network == 'cnn':
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        elif network == 'lstm':
            self._act = U.function([stochastic, ob, states, masks],
                                   [ac, self.vpred, self.snew])
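For the 'lstm' branch, the returned LSTM state has to be threaded through successive calls; a sketch of the expected calling pattern (pi, obs_batch, lstm_state and done_masks are illustrative names, not from the snippet):

# One rollout step with the recurrent policy: feed the previous LSTM state and
# the episode-done masks, and keep the returned state for the next call.
ac, vpred, snew = pi._act(True, obs_batch, lstm_state, done_masks)
lstm_state = snew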
Example #6
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ])

        self.debug = U.function([], [self.mean, self.std])
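incfiltparams is the low-level entry point; in the MPI RunningMeanStd this snippet closely follows, it is driven by an update(x) method that pools statistics across workers before folding them in. A sketch assuming mpi4py is available:

    def update(self, x):
        # Sum the batch's sum, sum-of-squares and count across all MPI workers,
        # then add the totals to the running filter via incfiltparams.
        from mpi4py import MPI
        x = x.astype('float64')
        n = int(np.prod(self.shape))
        totalvec = np.zeros(2 * n + 1, 'float64')
        addvec = np.concatenate([x.sum(axis=0).ravel(),
                                 np.square(x).sum(axis=0).ravel(),
                                 np.array([len(x)], dtype='float64')])
        MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
        self.incfiltparams(totalvec[0:n].reshape(self.shape),
                           totalvec[n:2 * n].reshape(self.shape),
                           totalvec[2 * n])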
Example #7
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 num_subpolicies,
                 network='mlp',
                 gaussian_fixed_var=True):
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var
        shape = []
        for d in range(1, len(ob.shape)):
            shape.append(ob.shape[d])

        with tf.variable_scope("obfilter", reuse=tf.AUTO_REUSE):
            self.ob_rms = RunningMeanStd(shape=shape)
        obs = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            if network == 'mlp':
                hid_size = 64
                num_hid_layers = 2
                self.hid_size = hid_size
                self.num_hid_layers = num_hid_layers
                self._mlp(obs, num_subpolicies, hid_size, num_hid_layers,
                          ac_space, gaussian_fixed_var)
            elif network == 'cnn':
                self._cnn(obs, num_subpolicies)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
Example #8
    def __init__(self, env, policy, old_policy, sub_policies, old_sub_policies,
                 comm, clip_param=0.2, entcoeff=0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64):
        self.policy = policy
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.num_subpolicies = len(sub_policies)
        self.sub_policies = sub_policies
        ob_space = env.observation_space
        ac_space = env.action_space
        if WRITE_SCALAR:
            self.scalar_writer = tf.summary.FileWriter(osp.join("savedir/",'checkpoints', 'scalar%d' % time.time()))

        # inputs for training theta
        ob = U.get_placeholder_cached(name="ob")
        ac = policy.pdtype.sample_placeholder([None])
        atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
        ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
        total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret, clip_param)
        self.master_policy_var_list = policy.get_trainable_variables()
        self.master_loss = U.function([ob, ac, atarg, ret], U.flatgrad(total_loss, self.master_policy_var_list))
        self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)
        summ = tf.summary.scalar("total_loss", total_loss)
        self.calc_summary = U.function([ob, ac, atarg, ret],[summ])


        self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(old_policy.get_variables(), policy.get_variables())])

        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []
        self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
        for i in range(self.num_subpolicies):
            varlist = sub_policies[i].get_trainable_variables()
            self.adams.append(MpiAdam(varlist))
            # loss for test
            loss = self.policy_loss(sub_policies[i], old_sub_policies[i], ob, self.sp_ac, atarg, ret, clip_param)
            self.losses.append(U.function([ob, self.sp_ac, atarg, ret], U.flatgrad(loss, varlist)))

            self.assign_subs.append(U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(), sub_policies[i].get_variables())]))
            self.zerograd = U.function([], self.nograd(varlist))

        U.initialize()

        self.master_adam.sync()
        for i in range(self.num_subpolicies):
            self.adams[i].sync()
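A rough outline of how the pieces built here are used in one training iteration; learner and the batch variable names are illustrative, but the calls match the functions constructed above.

# Illustrative master-policy update for one training iteration.
learner.assign_old_eq_new()                   # snapshot params into old_policy
for _ in range(int(learner.optim_epochs)):
    g = learner.master_loss(ob_batch, ac_batch, adv_batch, ret_batch)
    learner.master_adam.update(g, learner.optim_stepsize)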
Example #9
    def __init__(self,
                 name,
                 ob,
                 hid_size,
                 num_hid_layers,
                 gaussian_fixed_var=True):
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                if (len(ob.shape) == 2):
                    self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
                elif (len(ob.shape) == 3):
                    self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1] *
                                                        ob.get_shape()[2], ))
                elif (len(ob.shape) == 4):
                    self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1] *
                                                        ob.get_shape()[2] *
                                                        ob.get_shape()[3], ))
            #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            obz = ob

            # value function
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            self.vpred = tf.clip_by_value(
                U.sum(
                    U.dense(last_out,
                            1,
                            "vffinal",
                            weight_init=U.normc_initializer(1.0))[:, 0]), 0.0,
                1000.0)

        # sample actions
        self._act = U.function([ob], [self.vpred])
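Here _act only returns the (clipped) value prediction; a small wrapper along the lines below is presumably used to query it for a single observation (the method name is an assumption):

    def value(self, ob):
        # _act returns a one-element list; its entry is the scalar value
        # prediction (vpred is reduced over the batch by U.sum above).
        return self._act(ob[None])[0]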
Example #10
    def __init__(self, name, ob, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name

            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1],))
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            # obz = ob

            # value function
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

            # sub policy
            self.pdtype = pdtype = make_pdtype(ac_space)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(U.dense(last_out, hid_size, "pol%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                self.pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                self.pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(self.pdparam)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
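The trainer classes in this collection call get_variables and get_trainable_variables on these policies; in baselines-style policies those are thin collection lookups keyed on self.scope. A sketch under that assumption:

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)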
Example #11
    def __init__(self, envs, policies, sub_policies, old_policies, old_sub_policies, 
            clip_param=0.2, vfcoeff=1., entcoeff=0, divcoeff=0., optim_epochs=10, 
            master_lr=1e-3, sub_lr=3e-4, optim_batchsize=64, envsperbatch=None, 
            num_rollouts=None, nlstm=256, recurrent=False):
        self.policies = policies
        self.sub_policies = sub_policies
        self.old_policies = old_policies
        self.old_sub_policies = old_sub_policies
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_batchsize = optim_batchsize
        self.num_master_groups = num_master_groups = len(policies)
        self.num_subpolicies = num_subpolicies = len(sub_policies)
        self.ob_space = envs[0].observation_space
        self.ac_space = envs[0].action_space
        self.nbatch = nbatch = num_rollouts * envsperbatch
        self.envsperbatch = envsperbatch

        self.master_obs = [U.get_placeholder(name="master_ob_%i"%x, dtype=tf.float32,
            shape=[None] + list(self.ob_space.shape)) for x in range(num_master_groups)]
        self.master_acs = [policies[0].pdtype.sample_placeholder([None]) 
                for _ in range(num_master_groups)]
        self.master_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_master_groups)]
        self.master_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_master_groups)]
        retvals = zip(*[self.policy_loss(policies[i], 
            old_policies[i], self.master_obs[i], self.master_acs[i], self.master_atargs[i], 
            self.master_ret[i], clip_param, mask=tf.constant(1.), vfcoeff=vfcoeff, 
            entcoeff=entcoeff) for i in range(num_master_groups)])
        self.master_losses, self.master_kl, self.master_pol_surr, self.master_vf_loss, \
                self.master_entropy, self.master_values, _ = retvals 

        master_trainers = [tf.train.AdamOptimizer(learning_rate=master_lr, 
            name='master_adam_%i'%_) for _ in range(num_master_groups)]
        master_params = [policies[i].get_trainable_variables() 
                for i in range(num_master_groups)] 
        master_grads = [tf.gradients(self.master_losses[i], master_params[i])
                for i in range(num_master_groups)]
        master_grads = [list(zip(g, p)) for g, p in zip(master_grads, master_params)]
        # TODO: gradient clipping
        self.assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_policies[i].get_variables(), 
                policies[i].get_variables())]) for i in range(num_master_groups)]
        self.master_train_steps = [master_trainers[i].apply_gradients(master_grads[i])
                for i in range(num_master_groups)]
       

        if not recurrent:
            self.sub_obs = [U.get_placeholder(name="sub_ob_%i"%x, dtype=tf.float32,
                shape=[None] + list(self.ob_space.shape)) for x in range(num_subpolicies)]
        self.sub_acs = [sub_policies[0].pdtype.sample_placeholder([None]) 
                for _ in range(num_subpolicies)]
        self.sub_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        self.sub_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        self.logpacs = [tf.placeholder(dtype=tf.float32, shape=[num_subpolicies, None])
                for _ in range(num_subpolicies)]
        self.loss_masks = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        if recurrent:
            self.sub_obs = [U.get_placeholder(name="sub_ob_%i"%x, dtype=tf.float32,
                shape=[nbatch] + list(self.ob_space.shape)) for x in range(num_subpolicies)]
            self.sub_masks = [U.get_placeholder(name="masks_%i"%_, dtype=tf.float32, 
                shape=[nbatch]) for _ in range(num_subpolicies)]
            self.sub_states = [U.get_placeholder(name="states_%i"%_, dtype=tf.float32, 
                shape=[envsperbatch, 2*nlstm]) for _ in range(num_subpolicies)]
        sub_retvals = zip(*[self.policy_loss(sub_policies[i], 
            old_sub_policies[i], self.sub_obs[i], self.sub_acs[i], self.sub_atargs[i], 
            self.sub_ret[i], clip_param, mask=self.loss_masks[i], vfcoeff=vfcoeff, 
            entcoeff=entcoeff, divcoeff=divcoeff, logpacs=None)#self.logpacs[i]) 
            for i in range(num_subpolicies)])
        self.sub_losses, self.sub_kl, self.sub_pol_surr, self.sub_vf_loss, \
                self.sub_entropy, self.sub_values, self.div_loss = sub_retvals 

        sub_trainers = [tf.train.AdamOptimizer(learning_rate=sub_lr)
                for _ in range(num_subpolicies)]
        sub_params = [sub_policies[i].get_trainable_variables() 
                for i in range(num_subpolicies)] 
        sub_grads = [tf.gradients(self.sub_losses[i], sub_params[i])
                for i in range(num_subpolicies)]
        sub_grads = [list(zip(g, p)) for g, p in zip(sub_grads, sub_params)]
        # TODO: gradient clipping
        self.subs_assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(), 
                sub_policies[i].get_variables())]) for i in range(num_subpolicies)]
        self.sub_train_steps = [sub_trainers[i].apply_gradients(sub_grads[i])
                for i in range(num_subpolicies)]

        U.initialize()
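Unlike the earlier trainers, this class exposes raw apply_gradients ops rather than U.function wrappers, so a step is run through the session directly; learner and the batch names below are illustrative.

# Illustrative update for master group i.
sess = tf.get_default_session()
sess.run(learner.master_train_steps[i], feed_dict={
    learner.master_obs[i]: ob_batch,
    learner.master_acs[i]: ac_batch,
    learner.master_atargs[i]: adv_batch,
    learner.master_ret[i]: ret_batch,
})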
Example #12
    def __init__(self,
                 env,
                 sub_policy,
                 old_sub_policy,
                 comm,
                 clip_param=0.2,
                 entcoeff=0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 args=None):
        # self.policy = policy
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        # self.num_subpolicies = len(sub_policies)
        self.sub_policy = sub_policy
        self.args = args
        ob_space = env.observation_space
        ac_space = env.action_space

        # inputs for training theta
        ob = U.get_placeholder_cached(name="ob")
        # ac = policy.pdtype.sample_placeholder([None])
        atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        ret = tf.placeholder(dtype=tf.float32,
                             shape=[None])  # Empirical return
        entcoeff = tf.placeholder(dtype=tf.float32, name="entcoef")
        # total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret, clip_param, entcoeff)
        # self.master_policy_var_list = policy.get_trainable_variables()
        # self.master_loss = U.function([ob, ac, atarg, ret, entcoeff], U.flatgrad(total_loss, self.master_policy_var_list))
        # self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)

        # self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        #     for (oldv, newv) in zipsame(old_policy.get_variables(), policy.get_variables())])

        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []
        self.sp_ac = sub_policy.pdtype.sample_placeholder([None])
        # for i in range(self.num_subpolicies):
        varlist = sub_policy.get_trainable_variables()
        self.adams.append(MpiAdam(varlist))
        # loss for test
        loss = self.policy_loss(sub_policy, old_sub_policy, ob, self.sp_ac,
                                atarg, ret, clip_param, entcoeff)
        self.losses.append(
            U.function([ob, self.sp_ac, atarg, ret, entcoeff],
                       U.flatgrad(loss, varlist)))

        self.assign_subs.append(
            U.function(
                [], [],
                updates=[
                    tf.assign(oldv, newv)
                    for (oldv, newv) in zipsame(old_sub_policy.get_variables(),
                                                sub_policy.get_variables())
                ]))
        self.zerograd = U.function([], self.nograd(varlist))

        U.initialize()

        # self.master_adam.sync()
        # for i in range(self.num_subpolicies):
        self.adams[0].sync()
Example #13
    def __init__(self,
                 name,
                 ob,
                 ac_space,
                 hid_size,
                 num_hid_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True):
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var
        self.num_subpolicies = num_subpolicies

        with tf.variable_scope(name):
            self.scope = tf.get_variable_scope().name
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=(ob.get_shape()[1], ))
            # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            obz = ob / 255.0

            # value function
            last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            '''Conv2d'''
            last_out = tf.nn.relu(
                U.conv2d(last_out, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            last_out = tf.nn.relu(
                U.conv2d(last_out, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            last_out = tf.nn.relu(
                U.conv2d(last_out, 32, "l3", [3, 3], [1, 1], pad="VALID"))
            last_out = U.flattenallbut0(last_out)
            last_out = tf.nn.relu(
                tf.layers.dense(last_out,
                                512,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))

            self.vpred = U.dense(last_out,
                                 1,
                                 "vffinal",
                                 weight_init=U.normc_initializer(1.0))[:, 0]

            # master policy
            # last_out = obz
            # for i in range(num_hid_layers):
            #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "masterpol%i"%(i+1), weight_init=U.normc_initializer(1.0)))
            self.selector = U.dense(last_out,
                                    num_subpolicies, "masterpol_final",
                                    U.normc_initializer(0.01))
            self.pdtype = pdtype = CategoricalPdType(num_subpolicies)
            self.pd = pdtype.pdfromflat(self.selector)

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        # debug
        self._debug = U.function([stochastic, ob], [ac, self.selector])
        self._act_forced = U.function([stochastic, ob, self.selector],
                                      [ac, self.vpred])
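The obz = ob / 255.0 scaling implies ob carries raw pixel observations; a sketch of the kind of placeholder this constructor expects (the shape is illustrative, e.g. stacked 84x84 frames):

# Illustrative observation placeholder for the CNN-based master policy above.
ob = U.get_placeholder(name="ob", dtype=tf.float32,
                       shape=[None, 84, 84, 4])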