Example #1
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #2
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
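Both CNN variants above end in a `logits` layer whose width is `pdtype.param_shape()[0]`, and `pdfromflat(logits)` wraps those logits in the matching distribution (a categorical one for Atari-style discrete actions). A small hedged sketch of that relationship, assuming the baselines `make_pdtype` helper:

    import gym
    from baselines.common.distributions import make_pdtype

    ac_space = gym.spaces.Discrete(6)
    pdtype = make_pdtype(ac_space)
    print(pdtype.param_shape())   # [6]: width of the final logits layer for a 6-action space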
Example #3
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #4
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        ob_shape = (nbatch,) + ob_space.shape
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            flatten = tf.layers.flatten
            pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)


        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #5
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
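Note how the fixed-variance branch builds the Gaussian parameters: the per-sample means are concatenated with a single learned `[1, act_dim]` row of log standard deviations, and `mean * 0.0 + logstd` is just a broadcasting trick that tiles that row across the batch. A standalone numpy sketch of the trick (hypothetical shapes, independent of the policy code):

    import numpy as np

    mean = np.random.randn(4, 3).astype(np.float32)   # [batch, act_dim] policy means
    logstd = np.zeros((1, 3), dtype=np.float32)       # one shared row of log-stds
    # mean * 0.0 keeps the batch shape; adding logstd broadcasts it to every row
    pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
    assert pdparam.shape == (4, 6)                    # a diagonal-Gaussian pd expects 2 * act_dim params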
Example #6
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)


        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #7
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
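Because this policy is recurrent, `step` must be fed the LSTM state and the previous-step done mask, and it returns the updated state along with the action. A minimal rollout sketch under assumed names (`venv` is a hypothetical vectorized env with `nenv` copies, `policy` is an instance of this class, and `nenv`/`nsteps`/`nlstm` match the constructor arguments):

    import numpy as np

    obs = venv.reset()
    state = policy.initial_state                 # zeros of shape (nenv, nlstm*2)
    dones = np.zeros(nenv, dtype=np.float32)     # fed as M, the "done at t-1" mask
    for _ in range(nsteps):
        actions, values, state, neglogps = policy.step(obs, state, dones)
        obs, rewards, dones, _ = venv.step(actions)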
Example #8
    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, select the appropriate probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:,0]
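This constructor only wires an already-built latent into the distribution and value heads, so the caller constructs the encoder first and passes its output in. A hedged usage sketch, assuming the class is the baselines-style `PolicyWithValue` and that `observation_input`, `nature_cnn`, an `env`, and an `obs_batch` of observations are available (none of these names are defined in the example itself):

    import tensorflow as tf

    X, processed_x = observation_input(env.observation_space, batch_size=None)
    with tf.variable_scope("model"):
        latent = nature_cnn(processed_x)
    policy = PolicyWithValue(env, observations=X, latent=latent,
                             sess=tf.get_default_session())
    # sample actions, their negative log-probabilities, and values for a batch of observations
    a, neglogp, v = policy.sess.run([policy.action, policy.neglogp, policy.vf],
                                    {policy.X: obs_batch})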
Example #9
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #10
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #11
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        ob_shape = (nbatch,) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', 1)[:,0]
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #12
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, obs_name='ob',
              obrms=True, final_std=0.01, init_logstd=0.0, observation_permutation=None, action_permutation=None, soft_mirror=False):
        assert isinstance(ob_space, gym.spaces.Box)

        obs_perm_mat = np.zeros(
            (len(observation_permutation), len(observation_permutation)),
            dtype=np.float32)
        self.obs_perm_mat = obs_perm_mat
        for i, perm in enumerate(observation_permutation):
            obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

        if isinstance(ac_space, gym.spaces.Box):
            act_perm_mat = np.zeros(
                (len(action_permutation), len(action_permutation)),
                dtype=np.float32)
            self.act_perm_mat = act_perm_mat
            for i, perm in enumerate(action_permutation):
                self.act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
        elif isinstance(ac_space, gym.spaces.MultiDiscrete):
            total_dim = int(np.sum(ac_space.nvec))
            dim_index = np.concatenate([[0], np.cumsum(ac_space.nvec)])
            act_perm_mat = np.zeros((total_dim, total_dim), dtype=np.float32)
            self.act_perm_mat = act_perm_mat
            for i, perm in enumerate(action_permutation):
                perm_mat = np.identity(ac_space.nvec[i])
                if np.sign(perm) < 0:
                    perm_mat = np.flipud(perm_mat)
                    self.act_perm_mat[
                        dim_index[i]:dim_index[i] + ac_space.nvec[i],
                        dim_index[int(np.abs(perm)
                                      )]:dim_index[int(np.abs(perm))] +
                        ac_space.nvec[int(np.abs(perm))]] = perm_mat

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        print(self.pdtype)
        print([sequence_length] + list(ob_space.shape))
        ob = U.get_placeholder(name=obs_name,
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        mirror_ob = tf.matmul(ob, obs_perm_mat)
        mirror_obz = tf.clip_by_value(
            (mirror_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        if not obrms:
            obz = ob
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        if isinstance(ac_space, gym.spaces.Box):
            pol_net = GenericFF('pol_net', ob_space.shape[0], [],
                                pdtype.param_shape()[0] // 2, hid_size,
                                num_hid_layers)
        elif isinstance(ac_space, gym.spaces.MultiDiscrete):
            pol_net = GenericFF('pol_net', ob_space.shape[0], [],
                                pdtype.param_shape()[0], hid_size,
                                num_hid_layers)

        orig_out = pol_net.get_output_tensor(obz, None, tf.nn.tanh)
        mirr_out = tf.matmul(
            pol_net.get_output_tensor(mirror_obz, None, tf.nn.tanh),
            act_perm_mat)
        if not soft_mirror:
            mean = orig_out + mirr_out
        else:
            mean = orig_out
            self.additional_loss = tf.reduce_mean(
                tf.abs(orig_out - mirr_out)) * 1.0

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.constant_initializer(init_logstd))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = mean

        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #13
    def __init__(self,
                 ac_space,
                 X,
                 hidden_size,
                 n_layers=2,
                 activation="tanh",
                 value_baseline=False,
                 scope='MlpPolicy',
                 reuse=False,
                 X_placeholder=None,
                 fix_variance=False,
                 init_logstd=None):
        """
        Gaussian Policy. The variance is learned as parameters. You can also pass in the logstd from the outside.

            __init__: Construct the graph for the MLP policy.

        :param ac_space: action space, one of `gym.spaces.Box`
        :param X: Tensor or input placeholder for the observation
            :param hidden_size: size of hidden layers in network
        :param activation: one of 'reLU', 'tanh'
        :param scope: str, name of variable scope.
        :param reuse:
        :param value_baseline: bool flag whether compute a value baseline
        :param X_placeholder:
        :param fix_variance:
        :param init_logstd:
        """
        assert n_layers >= 2, f"hey, what's going on with this puny {n_layers}-layer network? " \
            f"--Ge (your friendly lab-mate)"
        if isinstance(scope, tf.VariableScope):
            self.scope_name = scope.name
        else:
            self.scope_name = scope
        self.name = (self.scope_name + "_reuse") if reuse else self.scope_name

        self.X_ph = X if X_placeholder is None else X_placeholder

        # done: this only applies to Discrete action space. Need to make more general.
        # now it works for both discrete action and gaussian policies.
        if isinstance(ac_space, spaces.Discrete):
            act_dim = ac_space.n
        else:
            act_dim, *_ = ac_space.shape

        if activation == 'tanh':
            act = tf.tanh
        elif activation == "relu":
            act = tf.nn.relu
        else:
            raise TypeError(f"{activation} is not available in this MLP.")
        with tf.variable_scope(scope, reuse=reuse):
            h_ = X
            for i in range(1, n_layers +
                           1):  # there is no off-by-one error here --Ge.
                h_ = fc(h_,
                        f'pi_fc_{i}',
                        nh=hidden_size,
                        init_scale=np.sqrt(2),
                        act=act)
                # a_ = fc(h_, f'pi_attn_{i}', nh=h_.shape[1], init_scale=np.sqrt(2), act=tf.math.sigmoid)
                # h_ = fc(h_ * a_, f'pi_fc_{i}', nh=hidden_size, init_scale=np.sqrt(2), act=act)
            mu = fc(h_, 'pi', act_dim, act=lambda x: x, init_scale=0.01)
            # _ = fc(h2, 'pi', act_dim, act=tf.tanh, init_scale=0.01)
            # mu = ac_space.low + 0.5 * (ac_space.high - ac_space.low) * (_ + 1)

            self.h_ = h_  # used for learned loss

            # assert (not G.vf_coef) ^ (G.baseline == "critic"), "These two can not be true or false at the same time."
            if value_baseline:
                # todo: conditionally declare these only when used
                # h1 = fc(X, 'vf_fc1', nh=hidden_size, init_scale=np.sqrt(2), act=act)
                # h2 = fc(h1, 'vf_fc2', nh=hidden_size, init_scale=np.sqrt(2), act=act)
                self.vf = fc(self.h_, 'vf', 1, act=lambda x: x)[:, 0]

            if isinstance(ac_space,
                          spaces.Box):  # gaussian policy requires logstd
                shape = tf.shape(mu)[0]
                if fix_variance:
                    _ = tf.ones(shape=[1, act_dim],
                                name="unit_logstd") * (init_logstd or 0)
                    logstd = tf.tile(_, [shape, 1])
                elif init_logstd is not None:
                    _ = tf.get_variable(
                        name="logstd",
                        shape=[1, act_dim],
                        initializer=tf.constant_initializer(init_logstd))
                    # todo: clip logstd to limit the range.
                    logstd = tf.tile(_, [shape, 1])
                else:
                    # use variance network when no initial logstd is given.
                    # _ = fc(X, 'logstd_fc1', nh=hidden_size, init_scale=np.sqrt(2), act=act)
                    # _ = fc(_, 'logstd_fc2', nh=hidden_size, init_scale=np.sqrt(2), act=act)

                    # note: this doesn't work. Really need to bound the variance.
                    # logstd = 1 + fc(self.h_, 'logstd', act_dim, act=lambda x: x, init_scale=0.01)
                    logstd = fc(self.h_,
                                'logstd',
                                act_dim,
                                act=lambda x: x,
                                init_scale=0.01)
                    # logstd = fc(self.h2, 'logstd', act_dim, act=tf.tanh, init_scale=0.01)
                    # logstd = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (logstd + 1)

                # GaussianPd takes 2 * [act_length] b/c of the logstd concatenation.
                ac = tf.concat([mu, logstd], axis=1)
                # A much simpler way is to multiply _logstd with a zero tensor shaped as mu.
                # [mu, mu * 0 + _logstd]
            else:
                raise NotImplementedError(
                    'Discrete action space is not implemented!')

            # list of parameters is fixed at graph time.
            # todo: Only gets trainables that are newly created by the current policy function.
            # self.trainables = tf.trainable_variables()

            # placeholders = placeholders_from_variables(self.trainables)
            # self._assign_placeholder_dict = {t.name: p for t, p in zip(self.trainables, placeholders)}
            # self._assign_op = tf.group(*[v.assign(p) for v, p in zip(self.trainables, placeholders)])

        with tf.variable_scope("Gaussian_Action"):
            self.pdtype = make_pdtype(ac_space)
            self.pd = self.pdtype.pdfromflat(ac)

            self.a = a = self.pd.sample()
            self.mu = self.pd.mode()
            self.neglogpac = self.pd.neglogp(a)
Example #14
    def __init__(self,
                 sess,
                 ob_space,
                 sensor_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False):  #pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        ob_sensor_shape = (nbatch, ) + sensor_space.shape
        actdim = ac_space.shape[0]
        X_camera = tf.placeholder(tf.uint8, ob_shape, name='Ob_camera')  #obs
        X_sensor = tf.placeholder(tf.float32,
                                  ob_sensor_shape,
                                  name='Ob_sensor')

        self.pdtype = make_pdtype(ac_space)

        with tf.variable_scope("model", reuse=reuse):
            h_camera = conv(tf.cast(X_camera, tf.float32) / 255.,
                            'c1',
                            nf=32,
                            rf=8,
                            stride=4,
                            init_scale=np.sqrt(2))
            h2_camera = conv(h_camera,
                             'c2',
                             nf=64,
                             rf=4,
                             stride=2,
                             init_scale=np.sqrt(2))
            h3_camera = conv(h2_camera,
                             'c3',
                             nf=64,
                             rf=3,
                             stride=1,
                             init_scale=np.sqrt(2))
            h3_camera = conv_to_fc(h3_camera)
            h4_camera = fc(h3_camera, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi_camera = fc(h4_camera, 'pi', actdim, init_scale=0.01)
            vf_camera = fc(h4_camera, 'v', 1)[:, 0]

        self.pd = self.pdtype.pdfromflat(pi_camera)

        with tf.variable_scope("model_sensor", reuse=reuse):
            h1_sensor = fc(X_sensor,
                           'pi_fc1',
                           nh=64,
                           init_scale=np.sqrt(2),
                           act=tf.tanh)
            h2_sensor = fc(h1_sensor,
                           'pi_fc2',
                           nh=64,
                           init_scale=np.sqrt(2),
                           act=tf.tanh)
            pi_sensor = fc(h2_sensor, 'pi', actdim, init_scale=0.01)
            h1_sensor = fc(X_sensor,
                           'vf_fc1',
                           nh=64,
                           init_scale=np.sqrt(2),
                           act=tf.tanh)
            h2_sensor = fc(h1_sensor,
                           'vf_fc2',
                           nh=64,
                           init_scale=np.sqrt(2),
                           act=tf.tanh)
            vf_sensor = fc(h2_sensor, 'vf', 1)[:, 0]

        with tf.variable_scope("model", reuse=reuse):
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())
            X = tf.concat([X_camera, X_sensor], 0)
            pi_full = tf.concat([pi_camera, pi_sensor], 0)
            pi = fc(pi_full, 'pi', actdim, init_scale=0.01)
            vf_full = tf.concat([vf_camera, vf_sensor], 0)
            vf = fc(vf_full, 'vf', 1)[:, 0]

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, ob_sensor, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {
                X_camera: ob,
                X_sensor: ob_sensor
            })
            return a, v, self.initial_state, neglogp

        def value(ob, ob_sensor, *_args, **_kwargs):
            return sess.run(vf, {X_camera: ob, X_sensor: ob_sensor})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #15
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        self.rep_loss = None
        # explicitly create a vector space for latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256,))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.compat.v1.placeholder(shape=(nbatch,) + ob_space.shape, dtype=np.float32, name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, None)
            TRAIN_NUM_STEPS = Config.NUM_STEPS//16
            REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 84, 84, 3), name='Rep_Proc')
            Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx')
            Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, Config.N_SKILLS), name='Curr_skill')
            CLUSTER_DIMS = 128
            HIDDEN_DIMS_SSL = 256
            self.protos = tf.compat.v1.Variable(initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)), trainable=True, name='Prototypes')
            self.A = self.pdtype.sample_placeholder([None],name='A')
            # trajectories of length m, for N policy heads.
            self.STATE = tf.compat.v1.placeholder(tf.float32, [None,84,84,3], name='State')
            self.STATE_NCE = tf.compat.v1.placeholder(tf.float32, [Config.REP_LOSS_M,1,None,84,84,3], name='State_NCE')
            self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None,84,84,3], name='ANCH_NCE')
            # labels of Q value quantile bins
            self.LAB_NCE = tf.compat.v1.placeholder(tf.float32, [Config.POLICY_NHEADS,None], name='Labels')
            self.A_i = self.pdtype.sample_placeholder([None,Config.REP_LOSS_M,1],name='A_i')
            self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None], name='R_cluster')
            # 1056
            self.A_cluster = self.pdtype.sample_placeholder([None], name='A_cluster')
            
        X = REP_PROC #tf.reshape(REP_PROC, [-1, 64, 64, 3])
        
        with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE):
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(REP_PROC)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops
        with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            self.h =  tf.concat([act_condit, act_invariant], axis=1)

        """
        Clustering part
        """

        N_ACTIONS = 5 if Config.ENVIRONMENT == 'ising' else 15
        with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            # h_codes: n_batch x n_t x n_rkhs
            act_condit, act_invariant, _, _ = choose_cnn(REP_PROC)
            self.h_codes =  tf.transpose(tf.reshape(self.h,[-1,Config.NUM_ENVS,256]),(1,0,2))
            act_one_hot = tf.transpose(tf.reshape(self.A_cluster, [-1,17,ac_space.shape[0]]), (1,0,2))
            # tf.one_hot on action clusters gives (1056, 15)
            # after reshape it's (33, 32, 15)
            # after transpose it's  (32, 33, 15)
            # cts will be (32, 33, X)
            #act_one_hot = tf.transpose(tf.reshape(tf.one_hot(self.A_cluster,ac_space.n),[-1,Config.NUM_ENVS,ac_space.n]),(1,0,2))
            h_acc = []
            h_acc_no_act = []
            for k in range(Config.CLUSTER_T):
                h_t = self.h_codes[:,k:tf.shape(self.h_codes)[1]-(Config.CLUSTER_T-k-1)]
                self.dummy_ht = h_t
                # if k = 0 , T = 50, then 0: 49
                # TODO(Ahmed) Ask Bogdan how exactly this action subsampling/indexing is carried out
                a_t = act_one_hot[:,k:tf.shape(act_one_hot)[1]-(Config.CLUSTER_T-k-1) -1]

                #expand reshaping line by line for easier debugging
                h_t_reshaped = tf.reshape(h_t,(-1,256))
                a_t_reshaped = tf.reshape(a_t,(-1, ac_space.shape[0]))
                h_t_final = tf.expand_dims(tf.expand_dims(h_t_reshaped,1),1)[1:]
                # TODO(Ahmed) ensure that reshape from 256 to 255 doesn't break downstream
                h_t_film = h_t_final
                # h_t_film = tf.reshape(FiLM(widths=[128], name='FiLM_layer')([h_t_final, a_t_reshaped])[:,0,0],(Config.NUM_ENVS,-1,255))
                h_acc_no_act.append(tf.reshape(h_t,(Config.NUM_ENVS,-1,256)))
                h_acc.append(h_t_film)
            
            # h_seq_no_act = tf.reshape( tf.concat(h_acc_no_act,2), (-1,256*Config.CLUSTER_T))
            h_seq = tf.reshape( tf.concat(h_acc,2), (-1,256*Config.CLUSTER_T))
            self.h_seq = h_seq

            # self.z_t_no_act = get_online_predictor(n_in=256*Config.CLUSTER_T,n_out=CLUSTER_DIMS,prefix='SH_z_pred_no_act')(h_seq_no_act)
            self.z_t = get_online_predictor(n_in=256*Config.CLUSTER_T,n_out=CLUSTER_DIMS,prefix='SH_z_pred')(h_seq)

            self.u_t = get_predictor(n_in=CLUSTER_DIMS,n_out=CLUSTER_DIMS,prefix='SH_u_pred')(self.z_t)
            
        self.z_t_1 = self.z_t
        # scores: n_batch x n_clusters
        # tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0]
        # tf.linalg.normalize(self.protos, axis=1, ord='euclidean')[0]
        scores = tf.linalg.matmul(tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0],tf.linalg.normalize(self.protos, axis=1, ord='euclidean')[0])
        self.codes = sinkhorn(scores=scores)

        self.myow_loss = 0.
        if Config.MYOW:
            """
            MYOW where k-NN neighbors are replaced by Sinkhorn clusters
            """
            # with tf.compat.v1.variable_scope("random", reuse=tf.compat.v1.AUTO_REUSE):
            #     # h_codes: n_batch x n_t x n_rkhs
            #     act_condit_target, act_invariant_target, _, _ = choose_cnn(X)
            #     h_codes_target =  tf.transpose(tf.reshape(tf.concat([act_condit_target, act_invariant_target], axis=1),[-1,Config.NUM_ENVS,256]),(1,0,2))
            #     h_t_target = h_codes_target[:,:-1]
            #     h_tp1_target = h_codes_target[:,1:]
                
            #     # h_a_t = tf.transpose(tf.reshape(get_predictor(n_in=ac_space.n,n_out=256,prefix="SH_a_emb")( act_one_hot), (-1,Config.NUM_ENVS,256)), (1,0,2))
            #     h_seq_target = tf.reshape( tf.concat([h_t_target,h_tp1_target],2), (-1,256*Config.CLUSTER_T))
                # act_one_hot_target = tf.reshape(tf.one_hot(self.A_cluster,ac_space.n), (-1,ac_space.n))
                # h_seq_target = tf.squeeze(tf.squeeze(FiLM(widths=[512,512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq_target,1),1), act_one_hot_target]),1),1)
            y_online = h_seq
            y_target = tf.stop_gradient(h_seq)
            # y_reward = tf.reshape(self.R_cluster,(-1,1))
            

            
            # Find cluster adjacency scores
            dist = _compute_distance(tf.transpose(self.protos),tf.transpose(self.protos))
            
            k_t = Config.N_KNN
            vals, indx = tf.nn.top_k(-dist, k_t+1,sorted=True)

            cluster_idx = tf.cast(tf.argmax(scores,1),tf.int32)

            cluster_membership_list = []
            for i in range(Config.N_SKILLS):
                filter_ = tf.cast(tf.fill(tf.shape(cluster_idx), i),tf.int32)
                mask = tf.math.equal(filter_ , cluster_idx)
                cluster_vecs = tf.cast(tf.where(mask),tf.int32)
                cluster_vecs = tf.cond(tf.math.equal(tf.shape(cluster_vecs)[0],0),lambda :tf.constant([[0]],tf.int32),lambda :cluster_vecs)
                # cluster_idx = tf.cast(tf.round(tf.random.uniform((1,),maxval=tf.cast(tf.shape(cluster_vecs),tf.float32))[0]),tf.int32) # randomly sample a vector from its cluster
                cluster_membership_list.append(cluster_vecs[0]) # take first vector of this cluster as representative
            cluster_membership_list = tf.stack(cluster_membership_list)
            
            # import ipdb;ipdb.set_trace()
            
            # N_target = y_target
            with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
                v_online_net = get_predictor(n_in=256*Config.CLUSTER_T,n_out=HIDDEN_DIMS_SSL,prefix='MYOW_v_pred')
                r_online_net = get_predictor(n_in=HIDDEN_DIMS_SSL,n_out=HIDDEN_DIMS_SSL,prefix='MYOW_r_pred')
                v_online = v_online_net(y_online)
                r_online = r_online_net(v_online)
            with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE):
                v_target_net = get_predictor(n_in=256*Config.CLUSTER_T,n_out=HIDDEN_DIMS_SSL,prefix='MYOW_v_pred')
                r_target_net = get_predictor(n_in=HIDDEN_DIMS_SSL,n_out=HIDDEN_DIMS_SSL,prefix='MYOW_r_pred')

            
            for k in range(k_t):
                nearby_cluster_idx = tf.gather(indx[:,k+1],cluster_idx)
                nearby_batch_vecs = tf.reshape(tf.gather(cluster_membership_list,tf.cast(nearby_cluster_idx,tf.int32)),(-1,))
                N_target = tf.gather(y_target, nearby_batch_vecs)
                v_target = v_target_net(N_target)
                # r_target = r_target_net(v_target)

                self.myow_loss += tf.reduce_mean(cos_loss(r_online, v_target)) #+ tf.reduce_mean(cos_loss(r_target, v_online))

            # with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            #     phi_s = get_online_predictor(n_in=256,n_out=CLUSTER_DIMS,prefix='SH_z_pred')(tf.reshape(h_acc[-1],(-1,256)))
            #     self.myow_loss += tf.reduce_mean(cos_loss(phi_s, tf.transpose(tf.gather(self.protos,cluster_idx,axis=1),(1,0)) ))


        """
        Intrinsic rewards
        """
        with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            self.R_I_SCALE = tf.nn.relu(get_linear_layer(n_in=256,n_out=1,prefix='r_i_scale',init=initializers.RandomNormal(stddev=0.11))(tf.reshape(tf.stop_gradient(h_acc[-1]),(-1,256))))

            # self.h = get_predictor(n_in=256+Config.N_SKILLS,n_out=256)(tf.concat([self.h,tf.stop_gradient(scores)],1))

        """
        Condition on soft-cluster assignments for the policy head (Cluster Conditioned Policy)
        """
        if Config.CLUSTER_CONDIT_POLICY:
            concat_code = tf.stop_gradient(tf.reshape(self.codes, [-1, Config.N_SKILLS]))
            # print(self.h)
            # print(concat_code)
            #self.h = tf.concat([self.h, concat_code], axis=1)
            #h_seq = tf.squeeze(tf.squeeze(FiLM(widths=[512,512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq,1),1), act_one_hot]),1),1)

        with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
                self.pd_train = []
                for i in range(Config.POLICY_NHEADS):
                    with tf.compat.v1.variable_scope("head_"+str(i), reuse=tf.compat.v1.AUTO_REUSE):
                        self.pd_train.append(self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0])
                with tf.compat.v1.variable_scope("head_i", reuse=tf.compat.v1.AUTO_REUSE):
                    self.pd_train_i = self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]
            else:
                with tf.compat.v1.variable_scope("head_0", reuse=tf.compat.v1.AUTO_REUSE):
                    self.pd_train = self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]
            
            if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
                # self.vf_train = [fc(self.h, 'v'+str(i), 1)[:, 0] for i in range(Config.POLICY_NHEADS)]
                self.vf_train = [fc(self.h, 'v_0', 1)[:, 0] ]
            else:
                self.dummy_vf_train = fc(self.h, 'v_0', 1)
                self.dummy_vf_train_curr = fc(self.h, 'v_0', 1)[:,0]
                self.vf_train = [fc(self.h, 'v_0', 1)[:, 0] ]
            self.vf_i_train = fc(tf.stop_gradient(self.h), 'v_i', 1)[:, 0]
            self.vf_i_run = self.vf_i_train

            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train
            

            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []


        # Use the current head for classical PPO updates
        # normalize policies to (-1, 1) for DM control
        a0_run_pre_clamp = self.pd_run.sample()
        
        neglogp0_run_pre_clamp = self.pd_run.neglogp(a0_run_pre_clamp)
        
        # normalize policies to (-1, 1) for DM control
        a0_run = tf.math.tanh(a0_run_pre_clamp)
        # after applying tanh, rescale the log-likelihoods as well.
        # Assuming X ~ Normal() and Y = tanh(X) then log p(Y) = log p(X) - log dy / dx
        neglogp0_run = neglogp0_run_pre_clamp + tf.reduce_sum(tf.math.log( (1 - a0_run ** 2) + 1e-7),keepdims=True)
        #(2. * (np.log(2.) - a0_run_pre_clamp[idx] - tf.nn.softplus(-2. * a0_run_pre_clamp[idx])))
        self.initial_state = None

        def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict = {},  *_args, **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)
            a, v, neglogp = sess.run([a0_run, self.vf_run, neglogp0_run], {REP_PROC: ob, Z: one_hot_skill})
            # sess.run([neglogp0_run[0]], {REP_PROC: ob})
            return a, v, 0., self.initial_state, neglogp
            

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h, {X: ob})

        def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})

        def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            return sess.run(self.vf_i_run, {REP_PROC: ob, Z: one_hot_skill})

        def nce_fw_pass(nce_dict):
            return sess.run([self.vf_i_run,self.rep_loss],nce_dict)

        def custom_train(ob, rep_vecs):
            return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]
        
        def compute_codes(ob,act):
            return sess.run([tf.reshape(self.codes , (Config.NUM_ENVS,Config.NUM_STEPS,-1)), tf.reshape(self.u_t , (Config.NUM_ENVS,Config.NUM_STEPS,-1)), tf.reshape(self.z_t_1 , (Config.NUM_ENVS,Config.NUM_STEPS,-1)) , self.h_codes[:,1:]], {REP_PROC: ob, self.A_cluster: act})
        
        def compute_hard_codes(ob):
            return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

        def compute_cluster_returns(returns):
            return sess.run([self.cluster_returns],{self.R_cluster:returns})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.value_i = value_i
        self.rep_vec = rep_vec
        self.custom_train = custom_train
        self.nce_fw_pass = nce_fw_pass
        self.encoder = choose_cnn
        self.REP_PROC = REP_PROC
        self.Z = Z
        self.compute_codes = compute_codes
        self.compute_hard_codes = compute_hard_codes
        self.compute_cluster_returns = compute_cluster_returns
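The policy above squashes sampled actions with tanh and then corrects the negative log-likelihood with the change-of-variables term `sum(log(1 - a**2))`. A small standalone numpy/scipy check of that identity, independent of the policy code:

    import numpy as np
    from scipy.stats import norm

    x = np.array([0.3, -1.2, 0.7])        # pre-squash sample from a unit Gaussian
    y = np.tanh(x)                        # squashed action in (-1, 1)
    neglogp_x = -norm.logpdf(x).sum()     # negative log-likelihood of the raw sample
    # change of variables: log p(y) = log p(x) - sum(log(dy/dx)), with dy/dx = 1 - tanh(x)^2
    neglogp_y = neglogp_x + np.log(1.0 - y ** 2 + 1e-7).sum()
    print(neglogp_y)                      # smaller than neglogp_x: squashing concentrates density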
Example #16
def train(train, restore):

    # Initialize the environment
    env = make_mujoco_env("Reacher-v2", 0)

    # new session
    sess = tf.Session()

    pdtype = make_pdtype(env.action_space)

    # initialize teacher agent
    teacher = TeacherAgent(env, sess, True, batch=1)

    # This observation placeholder is for querying teacher action
    ob_ph = U.get_placeholder(name="ob",
                              dtype=tf.float32,
                              shape=[1, env.observation_space.shape[0]])

    with tf.variable_scope("LSTM"):

        # different from ob_ph, this tf placeholder holds a batch of observations for lstm training
        ob_batch_ph = tf.placeholder(
            name="ob_batch_ph",
            dtype=tf.float32,
            shape=[STEPS_UNROLLED, LSTM_BATCH_SIZE, OBSPACE_SHAPE])

        prev_pdflat_batch_ph = tf.placeholder(
            name="prev_pdflat_batch_ph",
            dtype=tf.float32,
            shape=[STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE])

        #prev_rew_batch_ph = tf.placeholder( name="prev_rew_batch_ph", dtype=tf.float32,
        #                  shape=[ STEPS_UNROLLED, MLP_BATCH_SIZE, 1 ] )

        keep_prob_ph = tf.placeholder(name="keep_prob_ph",
                                      dtype=tf.float32,
                                      shape=[])

        # outer dim is 2 because of c_state and m_state
        initial_state_batch_ph = tf.placeholder(
            shape=[2, LSTM_BATCH_SIZE, NUM_UNITS], dtype=tf.float32)

        # lstm graph; shape of s_pdflat_batch:[STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE]
        s_pdflat_batch, final_state_batch = student_lstm_graph(
            ob_batch_ph, keep_prob_ph, prev_pdflat_batch_ph,
            initial_state_batch_ph)

        t_pdflat_batch_ph = tf.placeholder(
            name="t_pdflat_batch_ph",
            shape=[STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE],
            dtype=tf.float32)

        # get student action wrt last observation
        # beginning at the last index (i.e. STEPS_UNROLLED-1), take one element along the
        # first (outer) and second dimensions, and all elements along the last dimension
        s_pdflat_slice = tf.slice(s_pdflat_batch, [(STEPS_UNROLLED - 1),
                                                   (LSTM_BATCH_SIZE - 1), 0],
                                  [1, 1, -1])

        # for stepping
        s_action = pdtype.pdfromflat(s_pdflat_slice).mean

    # get the student's trainable variables within the 'LSTM' scope for optimization
    student_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope="LSTM")

    loss = kl_loss(s_pdflat_batch, t_pdflat_batch_ph, pdtype)

    with tf.name_scope("adam"):
        # adam optimizer to minimize the kl loss; the learning rate is fixed here
        adam = tf.train.AdamOptimizer(learning_rate=1e-3,
                                      beta1=0.9,
                                      beta2=0.999,
                                      epsilon=1e-8)

        minimize_adam = adam.minimize(loss, var_list=student_var)

    # initializer; to be placed at the very end
    init = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="LSTM"))

    # saver for restoring/saving depending on whether or not to train
    #saver = tf.train.Saver(
    #    var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='LSTM') )

    train_writer = tf.summary.FileWriter("/home/winstonww/reacher/data/viz/1")
    train_writer.add_graph(sess.graph)

    # state variables for lstm
    zero_state_batch = np.zeros([2, LSTM_BATCH_SIZE, NUM_UNITS])
    curr_state_batch = zero_state_batch

    with sess:

        # run initializer for adam optimizer
        sess.run(tf.variables_initializer(adam.variables()))

        # run initializer for lstm variables
        if not restore:
            sess.run(init)
        #elif glob.glob( lstm_trained_data_path + "*" ):
        #    saver.restore( sess, lstm_trained_data_path )
        else:
            print("attempt to restore trained data but {0} does not exist".
                  format(lstm_trained_data_path))

        dataset = Dataset(dir_path=dataset_path)
        # reset env
        ob = env.reset()

        reward = 0

        if train:
            # in this loop we accumulate enough teacher data to get us started
            print("Begin Training! First Accumulate observation with teacher")

            while dataset.num_episodes() <= LSTM_BATCH_SIZE * 2:

                # accumulate observations and teacher action data
                t_mean, t_pdflat = sess.run(
                    (teacher.pi.pd.mean, teacher.pi.pd.flat),
                    feed_dict={ob_ph: np.expand_dims(ob, axis=0)})

                dataset.write(ob=ob,
                              reward=reward,
                              t_pdflat=t_pdflat,
                              s_pdflat=np.zeros([PDFLAT_SHAPE]),
                              stepped_with='t')

                ob, reward, new, _ = env.step(t_mean)

                if new:
                    ob = env.reset()
                    dataset.flush()

            print("Accumulated sufficient data points from teacher. now train")

            while True:

                total_loss = 0
                s = zero_state_batch

                # BPTT
                print("BPTT")
                for (ob_batch_array, t_pdflat_batch_array,
                     prev_pdflat_batch_array,
                     prev_rew_batch_array) in dataset.training_batches():
                    # minimize loss to train student
                    l, s, _ = sess.run(
                        [loss, final_state_batch, minimize_adam],
                        feed_dict={
                            keep_prob_ph: KEEP_PROB,
                            ob_batch_ph: ob_batch_array,
                            # TODO: revert this back
                            #prev_pdflat_batch_ph: prev_pdflat_batch_array,
                            #prev_rew_batch_ph: prev_rew_batch_array,
                            #prev_pdflat_batch_ph: ob_batch_array,
                            #prev_pdflat_batch_ph: np.random.rand(STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE),
                            #prev_pdflat_batch_ph: np.zeros([STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE]),
                            t_pdflat_batch_ph: t_pdflat_batch_array,
                            initial_state_batch_ph: s
                        })
                    total_loss += l
                print("DONE")

                # Get Teacher action for the last observation
                new = None

                while not new:
                    t_pdflat = sess.run(
                        (teacher.pi.pd.flat),
                        feed_dict={ob_ph: np.expand_dims(ob, axis=0)})

                    ob_batch_array, prev_pdflat_batch_array, prev_rew_batch_array = dataset.test_batch(
                        ob)

                    # Get student action for the last observation
                    s_ac, s_pdflat, curr_state_batch = sess.run(
                        (s_action, s_pdflat_slice, final_state_batch),
                        feed_dict={
                            keep_prob_ph: 1,
                            ob_batch_ph: ob_batch_array,
                            #TODO: revert this back
                            #prev_pdflat_batch_ph: prev_pdflat_batch_array,
                            #prev_rew_batch_ph: prev_rew_batch_array,
                            #prev_pdflat_batch_ph: ob_batch_array,
                            #prev_pdflat_batch_ph: np.random.rand(STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE),
                            #prev_pdflat_batch_ph: np.zeros([STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE]),
                            initial_state_batch_ph: curr_state_batch
                        })

                    dataset.write(ob=ob,
                                  reward=reward,
                                  t_pdflat=t_pdflat,
                                  s_pdflat=s_pdflat,
                                  stepped_with='s')

                    # step with student
                    ob, reward, new, _ = env.step(s_ac)

                    if new:
                        print("************** Episode {0} ****************".
                              format(dataset.num_episodes()))
                        ob = env.reset()
                        print("recent loss: %f " % total_loss)
                        dataset.flush()
                        #save_path = saver.save(sess, lstm_trained_data_path )
                        if dataset.num_episodes() % MAX_CAPACITY == 0:
                            dataset.dump()
                        if dataset.num_episodes() == 5000: break
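The training loop above distills the teacher into the student LSTM by minimizing `kl_loss(s_pdflat_batch, t_pdflat_batch_ph, pdtype)`, whose implementation is not shown here. A hedged sketch of what such a helper could look like with the baselines-style pd API (the real `kl_loss` may differ):

    import tensorflow as tf

    def kl_loss(s_pdflat_batch, t_pdflat_batch, pdtype):
        # flatten [STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE] -> [N, PDFLAT_SHAPE]
        pdflat_dim = s_pdflat_batch.get_shape().as_list()[-1]
        s_pd = pdtype.pdfromflat(tf.reshape(s_pdflat_batch, [-1, pdflat_dim]))
        t_pd = pdtype.pdfromflat(tf.reshape(t_pdflat_batch, [-1, pdflat_dim]))
        # average KL(teacher || student) over all unrolled steps and batch elements
        return tf.reduce_mean(t_pd.kl(s_pd))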
Example #17
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Dict)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_config = U.get_placeholder(name="ob",
                                      dtype=tf.float32,
                                      shape=[sequence_length] +
                                      list(ob_space.spaces['joint'].shape))
        ob_target = U.get_placeholder(name="goal",
                                      dtype=tf.float32,
                                      shape=[sequence_length] +
                                      list(ob_space.spaces['target'].shape))
        obs_pos = U.get_placeholder(
            name="obs_pos",
            dtype=tf.float32,
            shape=[sequence_length] +
            list(ob_space.spaces['obstacle_pos1'].shape))
        #is_training = U.get_placeholder(name="bn_training", dtype=tf.bool, shape=())
        # construct v function model
        '''with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space['joint'].shape)

        obz = tf.clip_by_value((ob_config - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        goal_last_out = tf.clip_by_value((ob_target - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)'''
        last_out = ob_config
        goal_last_out = ob_target
        obs_last_out = obs_pos
        for i in range(num_hid_layers):
            last_out = dense(last_out,
                             hid_size,
                             "vfcfc%i" % (i + 1),
                             weight_init=U.normc_initializer(1.0),
                             weight_loss_dict={})
            #last_out = tf.layers.batch_normalization(last_out, training=is_training, name="vfcbn%i"%(i+1))
            last_out = tf.nn.tanh(last_out)
            goal_last_out = dense(goal_last_out,
                                  hid_size,
                                  "vfgfc%i" % (i + 1),
                                  weight_init=U.normc_initializer(1.0),
                                  weight_loss_dict={})
            #goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="vfgbn%i" % (i + 1))
            goal_last_out = tf.nn.tanh(goal_last_out)
            obs_last_out = dense(obs_last_out,
                                 hid_size,
                                 "vfobsfc%i" % (i + 1),
                                 weight_init=U.normc_initializer(1.0),
                                 weight_loss_dict={})
            #obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="vfobn%i"%(i+1))
            obs_last_out = tf.nn.tanh(obs_last_out)
        vpred = tf.concat([last_out, goal_last_out, obs_last_out], -1)
        self.vpred = dense(vpred,
                           1,
                           "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        # construct policy probability distribution model
        last_out = ob_config
        goal_last_out = ob_target
        obs_last_out = obs_pos

        for i in range(num_hid_layers):
            last_out = dense(last_out,
                             hid_size,
                             "pol_cfc%i" % (i + 1),
                             weight_init=U.normc_initializer(1.0),
                             weight_loss_dict={})
            #last_out = tf.layers.batch_normalization(last_out, training=is_training, name="pol_cbn%i"%(i+1))
            last_out = tf.nn.tanh(last_out)
            goal_last_out = dense(goal_last_out,
                                  hid_size,
                                  "pol_gfc%i" % (i + 1),
                                  weight_init=U.normc_initializer(1.0),
                                  weight_loss_dict={})
            #goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="pol_gbn%i" % (i + 1))
            goal_last_out = tf.nn.tanh(goal_last_out)
            obs_last_out = dense(obs_last_out,
                                 hid_size,
                                 "pol_obsfc%i" % (i + 1),
                                 weight_init=U.normc_initializer(1.0),
                                 weight_loss_dict={})
            #obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="pol_obn%i"%(i+1))
            obs_last_out = tf.nn.tanh(obs_last_out)
        last_out = tf.concat([last_out, goal_last_out, obs_last_out], -1)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.constant_initializer(
                                         [0.2, 0.2, -1., -1.]))
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob_config, ob_target, obs_pos],
                               [ac, self.vpred])
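A minimal call sketch for the network above (all names here are hypothetical; it assumes `pi` is a Baselines-style policy object exposing the `_act` function built in this `_init`, and that `ob` is the dict observation carrying the keys used for the placeholders):

    ob = env.reset()
    # split the dict observation into the three placeholders; [None] adds the batch dimension
    ac, vpred = pi._act(True, ob['joint'][None], ob['target'][None], ob['obstacle_pos1'][None])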
Ejemplo n.º 18
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        assert isinstance(ob_space, gym.spaces.Box)

        # define action and observation space
        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # implement Q-function approximation
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        # return the Q-function value
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # implement parametrization for policy over options
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.relu(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.relu(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        # return probabilities for the options
        self.op_pi = tf.nn.softmax(last_out)

        # always terminate
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.constant([True])

        # define the control policy / intra-option policy
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.relu(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            # now also use relus to squash to -1,1
            mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # sample stochastically -> this corresponds to exploration
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # choose the appropriate action, apply the ZOH if using option 0
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)

        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
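A rough rollout sketch for this two-option policy (`pi`, `ob` and `np` are assumed names; the option is drawn from the softmax returned by `_get_op`, and `_act` returns the action, value, last hidden layer and logstd defined above):

    op_probs = pi._get_op(ob[None])[0][0]                    # probabilities over the num_options options
    option = np.random.choice(pi.num_options, p=op_probs)
    ac, vpred, hidden, logstd = pi._act(True, ob[None], [option])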
Ejemplo n.º 19
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('pol'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
        # with tf.variable_scope('pol'):
        #     # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
        #     #                        -5.0, 5.0)
        #     last_out = ob
        #     for i in range(num_hid_layers):
        #         last_out = tf.nn.tanh(
        #             tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
        #                             kernel_initializer=U.normc_initializer(
        #                                 1.0), bias_initializer = tf.constant_initializer(0.1)))
        #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2,
        #                                name='final',
        #                                kernel_initializer=U.normc_initializer(
        #                                    0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[1,
        #                                                    pdtype.param_shape()[
        #                                                        0] // 2],
        #                              initializer=tf.zeros_initializer())
        #     # out_std = tf.exp(0.5*logstd + 0.0)
        #     # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        #     import numpy as np
        #     pdparam = tf.concat([mean, mean * 0.0 + np.random.randn(pdtype.param_shape()[0] // 2) * logstd], axis=1)
        #     # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        #     #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2,
        #     #                            name='final',
        #     #                            kernel_initializer=U.normc_initializer(
        #     #                                0.01))
        #     #     logstd = tf.get_variable(name="logstd", shape=[1,
        #     #                                                    pdtype.param_shape()[
        #     #                                                        0] // 2],
        #     #                              initializer=tf.zeros_initializer())
        #     #     out_std = tf.exp(0.5*logstd + 0.0)
        #     #     # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        #     #     import numpy as np
        #     #     pdparam = tf.concat([mean, np.random.randn(pdtype.param_shape()[0] // 2) * out_std], axis=1)
        #     # # else:
        #     # pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0],
        #     #                       name='final',
        #     #                       kernel_initializer=U.normc_initializer(
        #     #                           0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], ac)
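Since this variant returns only the action, a minimal call sketch (hypothetical `pi` and `ob`) would be:

    ac = pi._act(True, ob[None])   # stochastic=True samples from pd; False returns the distribution mode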
Ejemplo n.º 20
0
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            print('ob_mean shape: ', ob_mean.shape)
            sh = tf.shape(self.ph_ob)

            x = flatten_two_dims(self.ph_ob)
            x = tf.cast(x, dtype=tf.float32)
            l = []
            for i in range(4):
                r = tf.multiply(x[:, :, :, i * 3], 0.299)
                g = tf.multiply(x[:, :, :, i * 3 + 1], 0.587)
                b = tf.multiply(x[:, :, :, i * 3 + 2], 0.114)

                gray = r + g + b

                l.append(gray)

            x = tf.stack(l, axis=-1)
            x = tf.cast(x, dtype=tf.int32)

            l = []
            for i in range(4):
                r = ob_mean[:, :, i * 3] * 0.299
                g = ob_mean[:, :, i * 3 + 1] * 0.587
                b = ob_mean[:, :, i * 3 + 2] * 0.114

                gray = r + g + b

                l.append(gray)

            print('before obmean: ', self.ob_mean.shape)
            self.ob_mean = np.stack(l, axis=-1)
            self.ob_rgb_mean = ob_mean
            print('after obmean: ', self.ob_mean.shape)

            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
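The four stacked RGB frames are collapsed to grayscale with the standard luma weights (0.299, 0.587, 0.114). A self-contained NumPy sketch of the same conversion (the function name and the (H, W, 12) interleaved channel layout are assumptions taken from the loop above):

    import numpy as np

    def to_gray_stack(obs):
        # obs: (H, W, 12) array holding 4 stacked RGB frames in interleaved channel order
        frames = obs.reshape(obs.shape[0], obs.shape[1], 4, 3)   # (H, W, 4, 3)
        luma = np.array([0.299, 0.587, 0.114])                   # same weights as in the graph above
        return frames @ luma                                     # weighted sum over R, G, B -> (H, W, 4)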
Ejemplo n.º 21
0
    def _init(self, ob_space, ac_space,hid_size_V, hid_size_actor, num_hid_layers,V_keep_prob,\
             mc_samples,layer_norm,activation_critic,activation_actor, dropout_on_V,gaussian_fixed_var=True, sample_dropout=False):
        assert isinstance(ob_space, gym.spaces.Box)
        self.sample_dropout = sample_dropout

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz

        self.mc_samples = mc_samples
        self.V_keep_prob = V_keep_prob

        ### MAIN CHANGES
        #######################
        # Value function
        with tf.variable_scope("value_function"):
            
            dropout_networks = [last_out] * self.mc_samples
           # dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.V_keep_prob)
            
            for i in range(num_hid_layers):
                if layer_norm:
                    last_out = activation_critic(tc.layers.layer_norm(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)), center=True,scope="vffc_activation%i"%(i+1) ,scale=True))
                    
                    apply_layer = lambda x : activation_critic(tc.layers.layer_norm(tf.layers.dense(x, hid_size_V,name="vffc%i"%(i+1), 
                                        reuse=True) ,center=True,scope="vffc_activation%i"%(i+1) ,scale=True,reuse=True) )
                else:
                    last_out = activation_critic(tf.layers.dense(last_out, hid_size_V, name="vffc%i"%(i+1), \
                    kernel_initializer=U.normc_initializer(1.0)))
                    
                    apply_layer = lambda x : activation_critic(tf.layers.dense(x, hid_size_V,name="vffc%i"%(i+1), 
                                        reuse=True))
               
                dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.V_keep_prob)
            
            ## final layer
            self.vpred = tf.layers.dense(last_out, 1, name="vffinal", kernel_initializer=U.normc_initializer(1.0))[:,0]
            
            apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
                        name="vffinal", reuse=True)[:,0]
            dropout_networks = generate_layer(apply_layer, dropout_networks, self.V_keep_prob)

            mean, variance = tf.nn.moments(tf.stack(dropout_networks), 0)

            self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
            self.vpred_dropout_networks = dropout_networks

            self.variance = variance
            LAMBDA = tf.placeholder(dtype=tf.float32, shape=())
            self.v_lambda_variance = self.vpred_mc_mean + LAMBDA * tf.sqrt(variance)

        #######################
        ## Policy
        last_out = obz

        with tf.variable_scope("policy"):
            for i in range(num_hid_layers):
                
                last_out = U.dense(last_out, hid_size_actor, "polfc%i"%(i+1), \
                weight_init=U.normc_initializer(1.0)) 
                last_out = activation_actor(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        last_out = obz

        ## Building the function Q(s, a) (kept commented out below)
        
#        last_out2=self.pd.sample()
#        activation=tf.nn.relu
#        #######################
#        # Action Value function  
#        with tf.variable_scope("Q"):        
#            dropout_networks = [last_out] * self.mc_samples
#            dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.keep_prob)
#                 
#            ## concatenate state and action
#            last_out = tf.concat([last_out, last_out2], axis=-1)
#            
#            new_networks = []
#            for dropout_network in dropout_networks:
#                dropout_network = tf.concat([dropout_network, last_out2], axis=-1)
#                dropout_network, mask = U.bayes_dropout(dropout_network, self.keep_prob)
#                new_networks.append(dropout_network)
#            dropout_networks = new_networks
#            
#            ### hidden layers
#            for i in range(num_hid_layers):
#                
#                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="Q%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
#                apply_layer = lambda x : activation(tf.layers.dense(x, hid_size, activation=None, \
#                        name="Q%i"%(i+1), reuse=True))
#                dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.keep_prob)
#            
#            ## final layer
#            self.qpred = tf.layers.dense(last_out, 1, name="Qfinal", kernel_initializer=U.normc_initializer(1.0))[:,0]
#            
#            apply_layer = lambda x : tf.layers.dense(x, 1, activation=None, \
#                        name="Qfinal", reuse=True)[:,0]
#            dropout_networks=generate_dropout_layer(apply_layer,dropout_networks,self.keep_prob)
#            
#            self.qpred_mc_mean=tf.add_n(dropout_networks) / float(len(dropout_networks))
#            self.qpred_dropout_networks=dropout_networks

        ### MAIN CHANGES
        ## if dropout:
        if dropout_on_V:
            if self.sample_dropout:
                self._act = [U.function([stochastic, ob], [ac, x]) for x in dropout_networks]
            else:
                self._act = U.function([stochastic, ob], [ac, self.vpred_mc_mean])
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred])
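The value head above keeps `mc_samples` dropout copies of the network and combines their mean and variance into V + LAMBDA * sqrt(Var). A self-contained NumPy sketch of that combination (the function name is an assumption):

    import numpy as np

    def lambda_value(v_samples, lam):
        # v_samples: (mc_samples, batch) array, one value prediction per dropout mask
        mean = v_samples.mean(axis=0)
        std = v_samples.std(axis=0)
        return mean + lam * std   # lam > 0 gives an optimistic estimate, lam < 0 a conservative one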
Ejemplo n.º 22
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, init_std=1.0, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        self.varphi_dim = hid_size

        self.ob = utils.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        # self.ob = tf.placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=utils.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=utils.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            # Create 'num_hid_layers' hidden layers
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=utils.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                self.action_dim = ac_space.shape[0]

                # mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
                # logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

                self.dist_diagonal = True
                self.varphi = last_out
                self.varphi_dim = hid_size

                if self.dist_diagonal:
                    stddev_init = np.ones([1, self.action_dim]) * init_std
                    prec_init = 1. / (np.multiply(stddev_init, stddev_init))  # 1 x |a|
                    self.prec = tf.get_variable(name="prec", shape=[1, self.action_dim],
                                                initializer=tf.constant_initializer(prec_init))
                    kt_init = np.ones([self.varphi_dim, self.action_dim]) * 0.5 / self.varphi_dim
                    ktprec_init = kt_init * prec_init
                    self.ktprec = tf.get_variable(name="ktprec", shape=[self.varphi_dim, self.action_dim],
                                                  initializer=tf.constant_initializer(ktprec_init))
                    kt = tf.divide(self.ktprec, self.prec)
                    mean = tf.matmul(last_out, kt)

                    logstd = tf.log(tf.sqrt(1. / self.prec))
                else:
                    # Not implemented yet
                    raise NotImplementedError

                self.prec_get_flat = utils.GetFlat([self.prec])
                self.prec_set_from_flat = utils.SetFromFlat([self.prec])

                self.ktprec_get_flat = utils.GetFlat([self.ktprec])
                self.ktprec_set_from_flat = utils.SetFromFlat([self.ktprec])

                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=utils.normc_initializer(0.01))

        self.scope = tf.get_variable_scope().name

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = utils.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = utils.function([stochastic, self.ob], [ac, self.vpred])

        # Get all policy parameters
        vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + '/pol')
        # Remove log-linear parameters ktprec and prec to get only non-linear parameters
        del vars[-1]
        del vars[-1]
        beta_params = vars

        # Flat w_beta
        beta_len = np.sum([np.prod(p.get_shape().as_list()) for p in beta_params])
        w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

        # Unflatten w_beta
        beta_shapes = list(map(tf.shape, beta_params))
        w_beta_unflat_var = self.unflatten_tensor_variables(w_beta_var, beta_shapes)

        # w_beta^T * \grad_beta \varphi(s)^T
        v = tf.placeholder(dtype=self.varphi.dtype, shape=self.varphi.get_shape(), name="v_in_Rop")
        features_beta = self.alternative_Rop(self.varphi, beta_params, w_beta_unflat_var, v)

        self.features_beta = utils.function([self.ob, w_beta_var, v], features_beta)
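The last block estimates the Fisher matrix as the outer product of the score, i.e. the flattened gradient of log pi(a|s) with respect to the policy weights. A minimal NumPy sketch of that quantity (names are assumptions):

    import numpy as np

    def empirical_fisher(score):
        # score: flat gradient of log pi(a|s) w.r.t. the policy weights
        return np.outer(score, score)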
Ejemplo n.º 23
0
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=True):  #pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        actdim = ac_space.shape[0]

        window_length = ob_space.shape[1] - 1

        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  #obs

        #         with tf.variable_scope("model", reuse=reuse) as scope:

        #             # policy
        #             w0 = tf.slice(X, [0,0,0,0],[-1,-1,1,1], name='pi_sl0')
        #             x = tf.slice(X, [0,0,1,0],[-1,-1,-1,-1], name='pi_sl1')
        #             x = conv(tf.cast(x, tf.float32),'c1', fh=1,fw=4,nf=3, stride=1, init_scale=np.sqrt(2))
        #             # x = tf.layers.conv2d(
        #             #     inputs=x,
        #             #     filters=3,
        #             #     kernel_size=[1, 4],
        #             #     padding="valid",
        #             #     activation=tf.nn.relu)
        #             #(1, 3, 47, 3)

        #             x = conv(x, 'c2', fh=1, fw=window_length -3, nf=20, stride= window_length -3, init_scale=np.sqrt(2))
        #             # x = tf.layers.conv2d(
        #             #     inputs=x,
        #             #     filters=20,
        #             #     kernel_size=[1, window_length -3],
        #             #     padding="valid",
        #             #     strides=(1, window_length -3),
        #             #     activation=tf.nn.relu)

        #             x = tf.concat([x, w0], 3)

        #             x = conv(x, 'c3', fh=1, fw=1, nf=1, stride= 1, init_scale=np.sqrt(2))
        #             # x = tf.layers.conv2d(
        #             #     inputs=x,
        #             #     filters=1,
        #             #     kernel_size=[1, 1],
        #             #     padding="valid",
        #             #     strides=(1, 1),
        #             #     activation=tf.nn.relu)

        #             cash_bias = tf.zeros([x.shape[0],1,1,1], tf.float32)
        #             c = tf.concat([cash_bias, x], 1)

        #             v = conv_to_fc(x)

        #             # vf = fc(v, 'v',1)[:,0]

        #             f = tf.contrib.layers.flatten(c)
        #             eps = 10e20
        #             f = tf.clip_by_value(f, -eps, eps, 'clip1')
        #             # f = tf.Print(f, [f], "concatenate")
        #             pi = tf.nn.softmax(f)
        #             # pi = tf.Print(pi,[pi], 'pi ')

        #             # f = tf.nn.relu(f)
        #             vf = fc(v, 'v',1, act=tf.nn.relu)[:,0]

        #             # vf = tf.add(tf.ones(v.shape), v)

        #             # vf = fc(v, 'v',1)[:,0]

        #             # vf = tf.add(vf, tf.ones(vf.shape, tf.float32))

        #             logstd = tf.get_variable(name="logstd", shape=[1, actdim],
        #                 initializer=tf.zeros_initializer())
        #             eps = 80
        #             logstd = tf.clip_by_value(logstd, -eps, eps, 'clip_logstd')
        #             # logstd = tf.Print(logstd,[logstd], 'logstd ')
        with tf.variable_scope("model", reuse=reuse) as scope:
            w0 = tf.slice(X, [0, 0, 0, 0], [-1, -1, 1, 1])
            x = tf.slice(X, [0, 0, 1, 0], [-1, -1, -1, -1])

            # reuse when testing

            x = conv(tf.cast(x, tf.float32),
                     'c1',
                     fh=1,
                     fw=3,
                     nf=3,
                     stride=1,
                     init_scale=np.sqrt(2))

            x = conv(x,
                     'c2',
                     fh=1,
                     fw=window_length - 2,
                     nf=20,
                     stride=window_length - 2,
                     init_scale=np.sqrt(2))

            x = tf.concat([x, w0], 3)

            x = conv(x,
                     'c3',
                     fh=1,
                     fw=1,
                     nf=1,
                     stride=1,
                     init_scale=np.sqrt(2))

            cash_bias = tf.ones([x.shape[0], 1, 1, 1], tf.float32)
            c = tf.concat([cash_bias, x], 1)

            v = conv_to_fc(x)
            vf = fc(v, 'v', 1)[:, 0]

            f = tf.contrib.layers.flatten(c)

            pi = tf.nn.softmax(f)

            logstd = tf.get_variable(
                name="logstd",
                shape=[1, actdim],
                initializer=tf.truncated_normal_initializer())
            # logstd = tf.Print(logstd,[logstd], 'logstd ')
            eps = 50
            # logstd = tf.clip_by_value(logstd, -eps, eps, 'clip_logstd')

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        # a0 = tf.clip_by_value(a0, -eps, eps, 'clip2')
        a0 = tf.nn.softmax(a0)

        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp, lst, p = sess.run([a0, vf, neglogp0, logstd, pi],
                                             {X: ob})

            # print ("logstd: "+ str(lst[0]))

            # print ("action: " + str(a))
            # print ("value: {}".format(v))
            # print ("neglogp: "+ str(neglogp))
            # print ("f:{}".format(f))
            return a, v, self.initial_state, neglogp, lst[0], p

        def value(ob, *_args, **_kwargs):
            v = sess.run(vf, {X: ob})
            # print ("vf: " + str(v))
            return v

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
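A hypothetical single rollout step for this policy (`model` and `ob` are assumed names); the extra softmax applied to the sampled action turns it into a non-negative portfolio weight vector that sums to 1:

    a, v, _, neglogp, logstd, p = model.step(ob)   # ob must already carry the (nbatch, ...) shape of X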
Ejemplo n.º 24
0
    def __init__(self, sess, ob_space, action_space, nbatch, nsteps, reuse = False):
        # This will be used to initialize our kernels
        gain = np.sqrt(2)

        # Based on the action space, select which probability distribution type
        # we will use for our stochastic policy (in our case DiagGaussianPdType,
        # i.e. a diagonal Gaussian distribution)
        self.pdtype = make_pdtype(action_space)

        height, width, channel = ob_space.shape
        ob_shape = (height, width, channel)

        # Create the input placeholder
        inputs_ = tf.placeholder(tf.float32, [None, *ob_shape], name="input")

        # Normalize the images
        scaled_images = tf.cast(inputs_, tf.float32) / 255.

        """
        Build the model
        3 CNN for spatial dependencies
        Temporal dependencies is handle by stacking frames
        (Something funny nobody use LSTM in OpenAI Retro contest)
        1 common FC
        1 FC for policy
        1 FC for value
        """
        with tf.variable_scope("model", reuse = reuse):
            conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
            conv2 = conv_layer(conv1, 64, 4, 2, gain)
            conv3 = conv_layer(conv2, 64, 3, 1, gain)
            flatten1 = tf.layers.flatten(conv3)
            fc_common = fc_layer(flatten1, 512, gain=gain)

            # This build a fc connected layer that returns a probability distribution
            # over actions (self.pd) and our pi logits (self.pi).
            self.pd, self.pi = self.pdtype.pdfromlatent(fc_common, init_scale=0.01)

            # Calculate the v(s)
            vf = fc_layer(fc_common, 1, activation_fn=None)[:, 0]

        self.initial_state = None

        # Take an action from the action distribution (remember we are using a
        # stochastic policy, so we don't always take the action with the highest
        # probability; e.g. with two actions at 0.7 and 0.3 we pick the second one 30% of the time)
        a0 = self.pd.sample()

        # Function used to take a step; returns the action to take and V(s)
        def step(state_in, *_args, **_kwargs):
            action, value = sess.run([a0, vf], {inputs_: state_in})
           
            #print("step", action)
            
            return action, value

        # Function that calculates only the V(s)
        def value(state_in, *_args, **_kwargs):
            return sess.run(vf, {inputs_: state_in})

        # Function that outputs only the action to take
        def select_action(state_in, *_args, **_kwargs):
            return sess.run(a0, {inputs_: state_in})

        self.inputs_ = inputs_
        self.vf = vf
        self.step = step
        self.value = value
        self.select_action = select_action
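A hypothetical usage of the three helpers defined above (`policy` and `obs_batch` are assumed names):

    action, value_est = policy.step(obs_batch)     # sampled action and V(s)
    v_only = policy.value(obs_batch)               # V(s) only
    a_only = policy.select_action(obs_batch)       # sampled action only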
Ejemplo n.º 25
0
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        # Discrete
        actdim = ac_space.n
        with tf.compat.v1.variable_scope('policy', reuse=reuse):
            X = tf.compat.v1.placeholder(tf.float32, ob_shape, name='Ob')  #obs
            activ = tf.tanh
            # logstd = tf.Variable(name="logstd", shape=[1, actdim], initial_value=tf.zeros([1, actdim]))
            h1 = activ(fc(X, 'v_mix_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'v_mix_fc2', nh=64, init_scale=np.sqrt(2)))
            v_mix0 = fc(h2, 'v_mix', 1)[:, 0]
            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
        with tf.compat.v1.variable_scope('intrinsic', reuse=reuse):
            X_ALL = tf.compat.v1.placeholder(tf.float32,
                                             (None, ) + ob_space.shape,
                                             name='Ob_all')  #obs
            A_ALL = tf.compat.v1.placeholder(tf.float32, [None, actdim],
                                             name='Ac_all')  #actions
            INPUT = tf.concat([X_ALL, A_ALL], axis=1)
            activ = tf.tanh
            h1 = activ(fc(INPUT, 'intrinsic_fc1', nh=64,
                          init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'intrinsic_fc2', nh=64, init_scale=np.sqrt(2)))
            r_in0 = tf.tanh(fc(h2, 'r_in', 1))[:, 0]
            h1 = activ(fc(X, 'v_ex_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'v_ex_fc2', nh=64, init_scale=np.sqrt(2)))
            v_ex0 = fc(h2, 'v_ex', 1)[:, 0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v_ex, v_mix, neglogp = sess.run([a0, v_ex0, v_mix0, neglogp0],
                                               {X: ob})
            return a, v_ex, v_mix, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            v_ex, v_mix = sess.run([v_ex0, v_mix0], {X: ob})
            return v_ex, v_mix

        def intrinsic_reward(ob, ac, *_args, **_kwargs):
            r_in = sess.run(r_in0, {X_ALL: ob, A_ALL: ac})
            return r_in

        self.X = X
        self.X_ALL = X_ALL
        self.A_ALL = A_ALL
        self.pi = pi
        self.v_ex = v_ex0
        self.r_in = r_in0
        self.v_mix = v_mix0
        self.step = step
        self.value = value
        self.intrinsic_reward = intrinsic_reward
        self.policy_params = tf.compat.v1.trainable_variables("policy")
        self.intrinsic_params = tf.compat.v1.trainable_variables("intrinsic")
        self.policy_new_fn = MlpPolicyNew
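A hypothetical call pattern for the helpers above (`policy`, `obs` and `one_hot_actions` are assumed names; the A_ALL placeholder expects one-hot actions of width actdim):

    a, v_ex, v_mix, _, neglogp = policy.step(obs)
    r_in = policy.intrinsic_reward(obs, one_hot_actions)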
Ejemplo n.º 26
0
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        self.rep_loss = None
        # explicitly create a vector space for the latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256, ))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape,
                                         dtype=np.float32,
                                         name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, None)
            TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
            REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32,
                                                shape=(None, 64, 64, 3),
                                                name='Rep_Proc')
            Z_INT = tf.compat.v1.placeholder(dtype=tf.int32,
                                             shape=(),
                                             name='Curr_Skill_idx')
            Z = tf.compat.v1.placeholder(dtype=tf.float32,
                                         shape=(nbatch, Config.N_SKILLS),
                                         name='Curr_skill')
            CODES = tf.compat.v1.placeholder(dtype=tf.float32,
                                             shape=(1024, Config.N_SKILLS),
                                             name='Train_Codes')
            CLUSTER_DIMS = 256
            HIDDEN_DIMS_SSL = 256
            STEP_BOOL = tf.placeholder(tf.bool, shape=[])
            self.protos = tf.compat.v1.Variable(
                initial_value=tf.random.normal(shape=(CLUSTER_DIMS,
                                                      Config.N_SKILLS)),
                trainable=True,
                name='Prototypes')
            self.A = self.pdtype.sample_placeholder([None], name='A')
            self.R = tf.compat.v1.placeholder(tf.float32, [None], name='R')
            # trajectories of length m, for N policy heads.
            self.STATE = tf.compat.v1.placeholder(tf.float32,
                                                  [None, 64, 64, 3])
            self.STATE_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3])
            self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32,
                                                     [None, 64, 64, 3])
            # labels of Q value quantile bins
            self.LAB_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.POLICY_NHEADS, None])
            self.A_i = self.pdtype.sample_placeholder(
                [None, Config.REP_LOSS_M, 1], name='A_i')
            self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None])
            self.A_cluster = self.pdtype.sample_placeholder(
                [None, Config.NUM_ENVS], name='A_cluster')

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                processed_x)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops
            self.h = tf.concat([act_condit, act_invariant], axis=1)
        """
        Bisimulation code
        """
        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            # encoder loss
            act_one_hot_target = tf.reshape(tf.one_hot(self.A, ac_space.n),
                                            (-1, ac_space.n))
            pred_next_latent_mu1 = get_transition_model()(tf.concat(
                [self.h, act_one_hot_target], axis=1))
            pred_next_latent_mu2 = shuffle_custom(pred_next_latent_mu1)

            z_dist = tf.reduce_mean(
                tf.compat.v1.losses.huber_loss(
                    self.h,
                    shuffle_custom(self.h),
                    reduction=tf.compat.v1.losses.Reduction.NONE), 1)
            r_dist = tf.compat.v1.losses.huber_loss(
                self.R,
                shuffle_custom(self.R),
                reduction=tf.compat.v1.losses.Reduction.NONE)
            transition_dist = tf.reduce_mean(
                tf.compat.v1.losses.huber_loss(
                    pred_next_latent_mu1,
                    pred_next_latent_mu2,
                    reduction=tf.compat.v1.losses.Reduction.NONE), 1)

            bisimilarity = r_dist + Config.GAMMA * transition_dist
            self.encoder_bisimilarity_loss = tf.reduce_mean(
                tf.math.pow(z_dist - bisimilarity, 2))

            # latent loss
            pred_next_latent_mu1_3d = tf.transpose(
                tf.reshape(pred_next_latent_mu1, [-1, Config.NUM_ENVS, 256]),
                (1, 0, 2))  # 32 x n_timesteps x n_hidden
            h_3d = tf.transpose(tf.reshape(self.h, [-1, Config.NUM_ENVS, 256]),
                                (1, 0, 2))  # 32 x n_timesteps x n_hidden
            pred_next_latent_mu1 = pred_next_latent_mu1_3d[:, :-1, :]  # t = 0 to n_timesteps-1
            next_h = h_3d[:, 1:, :]  # t = 1 to n_timesteps
            diff = (pred_next_latent_mu1 - tf.stop_gradient(next_h))
            self.latent_transition_loss = tf.reduce_mean(0.5 *
                                                         tf.math.pow(diff, 2))

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):

            with tf.compat.v1.variable_scope("head_0",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train = [
                    self.pdtype.pdfromlatent(tf.stop_gradient(self.h),
                                             init_scale=0.01)[0]
                ]

            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]

            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train

            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []

        # Use the current head for classical PPO updates
        a0_run = [
            self.pd_run[head_idx].sample()
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        neglogp0_run = [
            self.pd_run[head_idx].neglogp(a0_run[head_idx])
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        self.initial_state = None

        def step(ob,
                 update_frac,
                 skill_idx=None,
                 one_hot_skill=None,
                 nce_dict={},
                 *_args,
                 **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)

            head_idx = 0
            a, v, neglogp = sess.run([
                a0_run[head_idx], self.vf_run[head_idx], neglogp0_run[head_idx]
            ], {X: ob})
            return a, v, self.initial_state, neglogp

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h, {X: ob})

        def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            if Config.AGENT == 'ppo_diayn':
                return sess.run(self.vf_run, {X: ob, Z: one_hot_skill})
            elif Config.AGENT == 'ppo_goal':
                return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})
            else:
                return sess.run(self.vf_run, {self.STATE: ob, X: ob})

        def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            if Config.AGENT == 'ppo_diayn':
                return sess.run(self.vf_i_run, {X: ob, Z: one_hot_skill})
            elif Config.AGENT == 'ppo_goal':
                return sess.run(self.vf_i_run, {
                    REP_PROC: ob,
                    Z: one_hot_skill
                })
            else:
                return sess.run(self.vf_i_run, {self.STATE: ob, X: ob})

        def nce_fw_pass(nce_dict):
            return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

        def custom_train(ob, rep_vecs):
            return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

        def compute_codes(ob, act):
            return sess.run([
                tf.reshape(self.codes,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.z_t_1,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                self.h_codes[:, 1:]
            ], {
                REP_PROC: ob,
                self.A_cluster: act
            })

        def compute_hard_codes(ob):
            return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

        def compute_cluster_returns(returns):
            return sess.run([self.cluster_returns], {self.R_cluster: returns})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.value_i = value_i
        self.rep_vec = rep_vec
        self.custom_train = custom_train
        self.nce_fw_pass = nce_fw_pass
        self.encoder = choose_cnn
        self.REP_PROC = REP_PROC
        self.Z = Z
        self.compute_codes = compute_codes
        self.compute_hard_codes = compute_hard_codes
        self.compute_cluster_returns = compute_cluster_returns
        self.CODES = CODES
        self.STEP_BOOL = STEP_BOOL
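A compact NumPy sketch of the bisimulation encoder loss computed above (all names are assumptions; in the graph the pairs are formed by shuffling the batch, and the distances are elementwise Huber losses averaged over features):

    import numpy as np

    def huber(a, b, delta=1.0):
        d = np.abs(a - b)
        return np.where(d < delta, 0.5 * d ** 2, delta * (d - 0.5 * delta))

    def bisim_encoder_loss(z_dist, r_dist, transition_dist, gamma):
        # z_dist, r_dist, transition_dist: per-pair distances between latents, rewards and predicted next latents
        bisimilarity = r_dist + gamma * transition_dist
        return np.mean((z_dist - bisimilarity) ** 2)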
Ejemplo n.º 27
0
    def __init__(self,
                 env,
                 observations,
                 goals,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        if goals is not None:
            self.goals = goals
            addition_layers = False
            activ = tf.nn.tanh
            nh = 256
            if addition_layers:
                latent = tf.layers.dense(latent, units=nh, activation=activ)
                vf_latent = tf.layers.dense(vf_latent,
                                            units=nh,
                                            activation=activ)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:, 0]
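A hypothetical evaluation of the tensors defined above (`policy` and `obs_batch` are assumed names):

    a, v, nlp = policy.sess.run([policy.action, policy.vf, policy.neglogp],
                                {policy.X: obs_batch})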
Ejemplo n.º 28
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0),
                      weight_loss_dict={}))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=U.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0),
                      weight_loss_dict={}))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
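A minimal call sketch (hypothetical `pi` and `ob`); with stochastic=False the switch above returns the distribution mode, i.e. the deterministic action typically used for behaviour-cloning style evaluation:

    ac, vpred = pi._act(False, ob[None])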
Ejemplo n.º 29
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              use_bias=True,
              use_critic=True,
              seed=None,
              hidden_W_init=U.normc_initializer(1.0),
              hidden_b_init=tf.zeros_initializer(),
              output_W_init=U.normc_initializer(0.01),
              output_b_init=tf.zeros_initializer()):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid_size: width of hidden layers
            num_hid_layers: depth
            gaussian_fixed_var: True->separate parameter for logstd, False->two-headed mlp
            use_bias: whether to include bias in neurons
        """
        assert isinstance(ob_space, gym.spaces.Box)

        if isinstance(hid_size, list):
            num_hid_layers = len(hid_size)
        else:
            hid_size = [hid_size] * num_hid_layers

        if seed is not None:
            tf.set_random_seed(seed)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        #Critic
        if use_critic:
            with tf.variable_scope('vf'):
                obz = tf.clip_by_value(
                    (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(last_out,
                                        hid_size[i],
                                        name="fc%i" % (i + 1),
                                        kernel_initializer=hidden_W_init))
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=hidden_W_init)[:, 0]

        #Actor
        with tf.variable_scope('pol'):
            last_out = tf.clip_by_value(
                (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out,
                                    hid_size[i],
                                    name='fc%i' % (i + 1),
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                self.mean = mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=output_W_init,
                    use_bias=use_bias)
                self.logstd = logstd = tf.get_variable(
                    name="pol_logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=output_b_init)
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out,
                                          pdtype.param_shape()[0],
                                          name='final',
                                          kernel_initializer=output_W_init)

        #Acting
        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if use_critic:
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        else:
            self._act = U.function([stochastic, ob], [ac, tf.zeros(1)])

        #Evaluating
        self.ob = ob
        self.ac_in = U.get_placeholder(name="ac_in",
                                       dtype=ac_space.dtype,
                                       shape=[sequence_length] +
                                       list(ac_space.shape))
        self.gamma = U.get_placeholder(name="gamma",
                                       dtype=tf.float32,
                                       shape=[])
        self.rew = U.get_placeholder(name="rew",
                                     dtype=tf.float32,
                                     shape=[sequence_length] + [1])
        self.logprobs = self.pd.logp(self.ac_in)  #  [\log\pi(a|s)]

        #Fisher
        with tf.variable_scope('pol') as vs:
            self.weights = weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=vs.name)
        self.flat_weights = flat_weights = tf.concat(
            [tf.reshape(w, [-1]) for w in weights], axis=0)
        self.n_weights = flat_weights.shape[0].value
        self.score = score = U.flatgrad(self.logprobs,
                                        weights)  # \nabla\log p(\tau)
        self.fisher = tf.einsum('i,j->ij', score, score)

        #Performance graph initializations
        self._setting = []
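
# Illustration (not part of the original class): the Fisher block above forms the
# empirical Fisher matrix as the outer product of the flattened policy score,
# tf.einsum('i,j->ij', score, score). A minimal NumPy sketch of the same idea,
# averaged over a few hypothetical per-trajectory scores:
import numpy as np

scores = np.random.randn(5, 12)  # 5 trajectories, 12 flattened policy parameters
# per-trajectory outer products g_k g_k^T, then the empirical average
fisher = np.einsum('ki,kj->kij', scores, scores).mean(axis=0)
print(fisher.shape)  # (12, 12)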
Ejemplo n.º 30
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                # logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
                logstd = tf.multiply(
                    tf.ones(shape=[1, pdtype.param_shape()[0] // 2]),
                    tf.constant(0.5 / ac_space.shape[0]))
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
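
# Side note (not part of the original code): in this variant the log-std is a
# constant, 0.5 / act_dim, rather than a trained variable. For a hypothetical
# 6-dimensional Box action space that works out to:
import numpy as np

act_dim = 6                       # stand-in for ac_space.shape[0]
logstd = 0.5 / act_dim            # the constant used above
print(logstd, np.exp(logstd))     # ~0.083 -> per-dimension std of ~1.09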
Ejemplo n.º 31
0
    def __init__(self, scope, ob_space, ac_space, ob_mean, ob_std, perception,
                 feat_spec, policy_spec, activation, layernormalize,
                 batchnormalize, add_noise, keep_noise, noise_std,
                 transfer_load, num_layers, keep_dim, transfer_dim, vf_coef,
                 coinrun):

        # Warnings: layernormalize and batchnormalize must not be passed
        # accidentally for coinrun.
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features."
            )
            raise NotImplementedError()

        if batchnormalize:
            print(
                "Warning: policy is operating on top of batch-normed features."
            )
            raise NotImplementedError()

        self.transfer_load = transfer_load
        self.transfer_dim = transfer_dim

        self.num_layers = num_layers
        self.keep_dim = keep_dim

        self.coinrun = coinrun

        self.ob_mean = ob_mean
        self.ob_std = ob_std

        self.add_noise = add_noise
        self.keep_noise = keep_noise
        self.noise_std = noise_std

        self.layernormalize = layernormalize
        self.batchnormalize = batchnormalize

        self.vf_coef = vf_coef

        input_shape = ob_space.shape

        # perception module
        self.perception = perception

        # feature dimension (hard-coded; ideally this should be configurable)
        self.feat_dim = 512

        # policy module
        self.feat_spec = feat_spec
        self.policy_spec = policy_spec

        self.activation = activation

        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)

            # placeholders
            dtype = ob_space.dtype
            if dtype == np.int8:
                dtype = np.uint8
            print('policy.py, class Policy, def __init__, dtype: {}'.format(
                dtype))

            # taken from baselines.common.input import observation_input
            self.ph_ob = tf.to_float(
                tf.placeholder(dtype=ob_space.dtype,
                               shape=(None, ) + ob_space.shape,
                               name='ob'))

            self.ph_ac = self.ac_pdtype.sample_placeholder([None], name='ac')
            self.pd = self.vpred = None
            self.scope = scope
            self.pdparamsize = self.ac_pdtype.param_shape()[0]

            with tf.variable_scope(self.scope + '_representation',
                                   reuse=False):
                self.unflattened_out = self.get_out(self.ph_ob, reuse=False)
                out = utils.flatten(self.unflattened_out)
                print(
                    'policy.py, class Policy, def __init__, self.out.shape: {}'
                    .format(out.shape))
                # we get features (feat_dim 512)
                self.features = self.get_features(out, reuse=False)

            pdparam, self.vpred = self.get_policy(self.features, reuse=False)
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
            self.logits = pdparam

            print(
                'policy.py, class Policy, def __init__, pdparam.shape: {}, pdparam.dtype: {}'
                .format(pdparam.shape, pdparam.dtype))
            print(
                'policy.py, class Policy, def __init__, self.vpred: {}'.format(
                    self.vpred.shape))
            print('policy.py, class Policy, def __init__, self.a_samp: {}'.
                  format(self.a_samp.shape))
            print(
                'policy.py, class Policy, def __init__, self.entropy.shape: {}'
                .format(self.entropy.shape))
            print(
                'policy.py, class Policy, def __init__, self.nlp_samp.shape: {}'
                .format(self.nlp_samp.shape))
            print(
                'policy.py, class Policy, def __init__, self.logits.shape: {}'.
                format(self.logits.shape))
Ejemplo n.º 32
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 K=32,
                 reuse=False,
                 M=None):  #pylint: disable=W0613
        assert M is not None
        ob_shape = (nbatch, ) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  #obs
        act = tf.tanh
        with tf.variable_scope("model", reuse=reuse):
            h1 = act(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = act(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
            h1 = act(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = act(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', K)  #[:,0]
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        # reparameterize actions
        noise = tf.random_normal([nbatch, M, actdim])
        mu = tf.expand_dims(pi, axis=1)
        std = tf.expand_dims(tf.exp(pi * 0.0 + logstd), axis=1)
        a_reparameterized = mu + std * noise

        # sample actions
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        # distributional info
        self.K = K
        vf_mean = tf.reduce_mean(vf, axis=-1)

        def step(ob, *_args, **_kwargs):
            a, v, neglogp, batchactions, v_avg = sess.run(
                [a0, vf, neglogp0, a_reparameterized, vf_mean], {X: ob})
            return a, v, self.initial_state, neglogp, batchactions, v_avg

        def value(ob, *_args, **_kwargs):
            return sess.run(vf_mean, {X: ob})

        self.a0 = a0
        self.X = X
        self.pi = pi
        self.vf = vf
        self.vf_mean = vf_mean
        self.step = step
        self.value = value
        self.a_reparameterized = a_reparameterized
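
# Illustration (not part of the original class): the reparameterized actions above
# are a deterministic transform of (mu, std) plus external noise, so gradients can
# flow through the policy parameters. NumPy sketch with hypothetical sizes:
import numpy as np

nbatch, M, actdim = 4, 3, 2
mu = np.random.randn(nbatch, 1, actdim)        # tf.expand_dims(pi, axis=1)
std = np.full((nbatch, 1, actdim), 0.5)        # tf.expand_dims(tf.exp(logstd), axis=1)
noise = np.random.randn(nbatch, M, actdim)     # tf.random_normal([nbatch, M, actdim])
a_reparameterized = mu + std * noise
print(a_reparameterized.shape)                 # (4, 3, 2)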
Ejemplo n.º 33
0
    def _init(self, ob_space, ac_space, hid_dims_p, hid_dims_v, train=True):
        assert isinstance(ob_space, gym.spaces.Box)
        self.pdtype = pdtype = make_pdtype(ac_space)
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.hid_dims_p = hid_dims_p
        self.hid_dims_v = hid_dims_v
        # # with tf.variable_scope('rms'):
        # #     self.ob_rms = RunningMeanStd(dtype=tf.float32, shape=ob_space.shape)
        # with tf.variable_scope('cnn'):
        #     self.ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
        #     # self.obz = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        #     self.x = self.ob/255.0
        #
        #     self.x = tf.nn.relu(U.conv2d(self.x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        #     self.x = tf.nn.relu(U.conv2d(self.x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        #     self.x = tf.nn.relu(U.conv2d(self.x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        #     self.x = U.flattenallbut0(self.x)
        #     self.x = tf.nn.relu(tf.layers.dense(self.x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
        #
        # odim = self.x.shape[-1]
        # odim = int(odim)
        # # print(self.ac_space.shape)
        # adim = pdtype.param_shape()[0]

        with tf.variable_scope("cnn"):
            self.ob = U.get_placeholder(name="ob",
                                        dtype=tf.float32,
                                        shape=[None] + list(ob_space.shape))
            # self.obz = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            self.x = self.ob / 255.0

            W_conv1 = tf.Variable(
                tf.truncated_normal([5, 5, 4, 32], stddev=0.1))
            b_conv1 = tf.Variable(tf.constant(0.1, shape=[32]))
            h_conv1 = tf.nn.relu(
                tf.nn.conv2d(
                    self.x, W_conv1, strides=[1, 1, 1, 1], padding="SAME") +
                b_conv1)
            h_pool1 = tf.nn.max_pool(h_conv1,
                                     ksize=[1, 4, 4, 1],
                                     strides=[1, 4, 4, 1],
                                     padding='SAME')

            W_conv2 = tf.Variable(
                tf.truncated_normal([5, 5, 32, 64], stddev=0.1))
            b_conv2 = tf.Variable(tf.constant(0.1, shape=[64]))
            h_conv2 = tf.nn.relu(
                tf.nn.conv2d(
                    h_pool1, W_conv2, strides=[1, 1, 1, 1], padding="SAME") +
                b_conv2)
            h_pool2 = tf.nn.max_pool(h_conv2,
                                     ksize=[1, 4, 4, 1],
                                     strides=[1, 4, 4, 1],
                                     padding='SAME')

            ## Fully connected layer; the hidden layer has 1024 units
            W_fc1 = tf.Variable(
                tf.truncated_normal([8 * 8 * 64, 1024], stddev=0.1))
            b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]))
            h_pool3_flat = tf.reshape(h_pool2, [-1, 8 * 8 * 64])
            # flatten the pooled feature maps: [n_samples, 8, 8, 64] -> [n_samples, 8*8*64]
            h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) +
                               b_fc1)  # non-linear activation
            # h_fc1 = tf.matmul(h_pool3_flat, W_fc1) + b_fc1  # linear variant (no activation)
            h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)  # dropout to reduce overfitting

            self.x = tf.nn.relu(
                tf.layers.dense(h_fc1_drop,
                                512,
                                name="polfc",
                                kernel_initializer=U.normc_initializer(1.0)))

        odim = self.x.shape[-1]
        odim = int(odim)
        # print(self.ac_space.shape)
        adim = pdtype.param_shape()[0]

        with tf.variable_scope('pol'):
            self._policy_nn(odim, adim, train)
        with tf.variable_scope('vf'):
            self._vf_nn(odim, adim, train)
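
# Sanity check (not part of the original class): the reshape to 8 * 8 * 64 above
# assumes the two stride-4 max-pools (SAME padding) shrink the input by a factor
# of 16 per spatial dimension, i.e. a hypothetical 128x128 observation:
def pooled_side(side, pools=2, stride=4):
    for _ in range(pools):
        side = -(-side // stride)   # ceil division, matching SAME padding
    return side

side = pooled_side(128)             # 128x128 input assumed for illustration
print(side, side * side * 64)       # 8 4096 -> matches [-1, 8 * 8 * 64]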
Ejemplo n.º 34
0
    def __init__(self, tf_session, ob_space, ac_space, nbatch,
                 reward_redistribution_config, observation_network_config, lstm_network_config, training_config,
                 exploration_config, nsteps, nlstm=64, reuse=False):
        """LSTM policy network, as described in RUDDER paper
        
        Based on baselines.ppo2.policies.py; the LSTM layer sees features from its own trainable observation network and
        the features from the reward redistribution observation network;
        
        Parameters
        -------
        tf_session : tensorflow session
            tensorflow session to compute the graph in
        ob_space
            Baselines ob_space object (see ppo2_rudder.py); must provide .shape attribute for (x, y, c) shapes;
        ac_space
            Baselines ac_space object (see ppo2_rudder.py); must provide .n attribute for number of possible actions;
        nbatch : int
            Batchsize
        nsteps : int
            Fixed number of timesteps to process at once
        reward_redistribution_config : dict
            Dictionary containing config for reward redistribution:
            -----
            lambda_eligibility_trace : float
                Eligibility trace value for redistributed reward
            vf_contrib : float
                Weighting of original value function (vf) vs. redistributed reward (rr), s.t.
                :math:`reward = vf \cdot vf\_contrib + rr \cdot (1-vf\_contrib)`
            use_reward_redistribution_quality_threshold : float
                Quality of reward redistribution has to exceed use_reward_redistribution_quality_threshold to be used;
                use_reward_redistribution_quality_threshold range is [0,1]; Quality measure is the squared prediction
                error, as described in RUDDER paper;
            use_reward_redistribution : bool
                Use reward redistribution?
            rr_junksize : int
                Chunk size ("junksize") for reward redistribution; chunks overlap by one half each
            cont_pred_w : float
                Weighting of the continuous prediction loss vs. the prediction loss of the final return at the last timestep
            intgrd_steps : int
                Stepsize for integrated gradients
            intgrd_batchsize : int
                Integrated gradients is computed batch-wise if intgrd_batchsize > 1
        observation_network_config : dict
            Dictionary containing config for observation network that processes observations and feeds them to LSTM
            network:
            -----
            show_states : bool
                Show frames to network?
            show_statedeltas : bool
                Show frame deltas to network?
            prepoc_states : list of dicts
                Network config to preprocess frames
            prepoc_deltas : list of dicts
                Network config to preprocess frame deltas
            prepoc_observations : list of dicts
                Network config to preprocess features from frame and frame-delta preprocessing networks
        lstm_network_config : dict
            Dictionary containing config for LSTM network:
            -----
            show_actions : bool
                Show taken actions to LSTM?
            reversed : bool
                Process game sequence in reversed order?
            layers : list of dicts
                Network config for LSTM network and optional additional dense layers
            initializations : dict
                Initialization config for LSTM network
            timestep_encoding : dict
                Set "max_value" and "triangle_span" for TeLL.utiltiy.misc_tensorflow.TriangularValueEncoding class
        training_config : dict
            Dictionary containing config for training and update procedure:
            -----
            n_no_rr_updates : int
                Number of updates to perform without training or using reward redistribution network
            n_pretrain_games : int
                Number of games to pretrain the reward redistribution network without using it;
            downscale_lr_policylag : bool
                Downscale the learning rate permanently if the policy lag gets too large?
            optimizer : tf.train optimizer
                Optimizer in tf.train, e.g. "AdamOptimizer"
            optimizer_params : dict
                Kwargs for optimizer
            l1 : float
                Weighting for l1 weight regularization
            l2 : float
                Weighting for l2 weight regularization
            clip_gradients : float
                Threshold for clipping gradients (clipping by norm)
        exploration_config : dict
            Dictionary containing config for exploration:
            -----
            sample_actions_from_softmax : bool
                True: Apply softmax to policy network output and use it as probabilities to pick an action
                False: Use the max. policy network output as action
            temporal_safe_exploration : bool
                Use RUDDER safe exploration
            save_pi_threshold : float
                Threshold value in range [0,1] for safe actions in RUDDER safe exploration
        nlstm : int
            Number of LSTM units (=memory cells)
        reuse : bool
            Reuse tensorflow variables?
        """
        #
        # Shapes
        #
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        seq_ob_shape = (nenv, -1, nh, nw, 1)
        nact = ac_space.n
        
        #
        # Placeholders for inputs
        #
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        
        #
        # Prepare input
        #
        single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape), dtype=tf.float32)
        delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape), dtype=tf.float32)
        
        #
        #  Get observation features from RR model
        #
        rr_model = RewardRedistributionModel(reward_redistribution_config=reward_redistribution_config,
                                             observation_network_config=observation_network_config,
                                             lstm_network_config=lstm_network_config, training_config=training_config,
                                             scopename="RR")
        self.rr_observation_model = rr_model
        rr_observation_layer = rr_model.get_visual_features(single_frame=single_frames, delta_frame=delta_frames,
                                                            additional_inputs=[])
        
        #
        #  Build policy network
        #
        with tf.variable_scope("model", reuse=reuse):
            temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32), trainable=False,
                                          name='temperature')
            
            additional_inputs = [StopGradientLayer(rr_observation_layer)]
            observation_layers, observation_features = observation_network(
                    single_frame=single_frames, delta_frame=delta_frames, additional_inputs=additional_inputs,
                    observation_network_config=observation_network_config)
            
            self.observation_features_shape = observation_features.get_output_shape()
            
            xs = [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps,
                                                       value=tf.reshape(observation_layers[-1].get_output(),
                                                                        [nenv, nsteps, -1]))]
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            h6 = h5
            pi = fc(h6, 'pi', nact)
            vf = fc(h6, 'v', 1)
        
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)
        
        if exploration_config['sample_actions_from_softmax']:
            a0 = self.pd.sample_temp(temperature=temperature)
        else:
            a0 = tf.argmax(pi, axis=-1)
        
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        
        def step(ob, state, mask):
            a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, v, s, neglogp
        
        def value(ob, state, mask):
            return tf_session.run(v0, {X:ob, S:state, M:mask})
        
        def action(ob, state, mask, *_args, **_kwargs):
            a, s, neglogp = tf_session.run([a0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, s, neglogp
        
        #
        # Placeholders for exploration
        #
        n_envs = pi.shape.as_list()[0]
        exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
        prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        
        #
        # Setting up safe exploration
        #
        explore = tf.logical_and(tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                                                tf.less_equal(gamelengths_pl,
                                                              exploration_timesteps_pl + exploration_durations_pl)),
                                 tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

        safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
        safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
        save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                                   / (n_envs + (n_envs == 1) - 1)) * (1 - exploration_config['save_pi_threshold']))
        safe_pi = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
        safe_pi /= tf.reduce_sum(safe_pi)
        
        rand_safe_a = tf.multinomial(safe_pi, 1)[:, 0]
        
        safe_pi_flat = tf.reshape(safe_pi, (-1,))
        prev_action_is_safe = tf.gather(safe_pi_flat,
                                        prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
                                        * safe_pi.shape.as_list()[1])
        prev_action_is_safe = tf.greater(prev_action_is_safe, tf.constant(0, dtype=tf.float32))
        
        a_explore = tf.where(tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                                           tf.not_equal(gamelengths_pl, exploration_timesteps_pl)),
                                            prev_action_is_safe),
                             prev_actions_pl, rand_safe_a)
        
        a_explore = tf.where(explore, a_explore, a0)
        
        # Make sure the actor doesn't repeat an action too often (otherwise screensaver might start)
        rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n, dtype=a0.dtype)
        a_explore = tf.where(tf.greater(prev_action_count_pl, tf.constant(20, dtype=tf.int64)), rand_a, a_explore)
        
        if not exploration_config['temporal_safe_exploration']:
            a_explore = a0
            
        neglogp_explore = self.pd.neglogp(a_explore)
        
        def action_exploration(ob, state, mask, *_args, exploration_timesteps, prev_actions, gamelengths,
                               keep_prev_action, prev_action_count, exploration_durations, **_kwargs):
            """Get actions with exploration for long-term reward"""
            a, s, neglogp = tf_session.run([a_explore, snew, neglogp_explore],
                                  {X: ob, S:state, M:mask, exploration_timesteps_pl: exploration_timesteps,
                                   prev_actions_pl: prev_actions,
                                   gamelengths_pl: gamelengths, exploration_durations_pl: exploration_durations,
                                   keep_prev_action_pl: keep_prev_action, prev_action_count_pl: prev_action_count})
            return a, s, neglogp
        
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.action = action
        self.action_exploration = action_exploration
        self.seq_ob_shape = seq_ob_shape
        self.exploration_config = exploration_config
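
# Illustration (not part of the original class): the safe-exploration code above
# rescales the logits per environment to [0, 1] and keeps only actions close to
# the best one as candidates for random exploration. NumPy sketch with a
# hypothetical threshold:
import numpy as np

pi = np.array([[2.0, 0.5, -1.0, 0.0],
               [0.1, 0.2, 0.3, 3.0]])                 # 2 envs x 4 actions
safe = pi - pi.min(axis=-1, keepdims=True)
safe = safe / safe.max(axis=-1, keepdims=True)        # per-env rescale to [0, 1]
threshold = 0.7                                       # stand-in for save_pi_threshold
safe = (safe >= threshold).astype(np.float32)
print(safe)   # only near-maximal actions stay eligible for the multinomial draw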
Ejemplo n.º 35
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=False,
              popart=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope("popart"):
            self.v_rms = RunningMeanStd(shape=[1])

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))
        self.norm_vpred = dense(last_out,
                                1,
                                "vffinal",
                                weight_init=U.normc_initializer(1.0))[:, 0]
        if popart:
            self.vpred = denormalize(self.norm_vpred, self.v_rms)
        else:
            self.vpred = self.norm_vpred

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())

        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.mean_and_logstd = U.function([ob], [self.pd.mean, self.pd.logstd])

        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])

        self.use_popart = popart
        if popart:
            self.init_popart()

        ret = tf.placeholder(tf.float32, [None])
        vferr = tf.reduce_mean(tf.square(self.vpred - ret))
        self.vlossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr,
                                                  self.get_vf_variable()))
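
# Assumption (denormalize is not shown in this snippet): in pop-art style value
# normalization it typically rescales the normalized prediction back using the
# running return statistics. Minimal sketch under that assumption:
import numpy as np

class FakeRms:                      # stand-in for the RunningMeanStd in "popart"
    mean, std = 2.0, 3.0

def denormalize(norm_vpred, rms):
    return norm_vpred * rms.std + rms.mean

print(denormalize(np.array([0.0, 1.0, -1.0]), FakeRms))   # [ 2.  5. -1.]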
Ejemplo n.º 36
0
def _mlpPolicy(hiddens,
               ob,
               ob_space,
               ac_space,
               scope,
               gaussian_fixed_var=True,
               reuse=False):
    assert isinstance(ob_space, gym.spaces.Box)

    with tf.variable_scope(scope, reuse=reuse):
        pdtype = make_pdtype(ac_space)
        sequence_length = None

        with tf.variable_scope("obfilter"):
            ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - ob_rms.mean) / ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hidden,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hidden,
                        name='fc%i' % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        pd = pdtype.pdfromflat(pdparam)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, pd.sample(), pd.mode())
        _act = U.function([stochastic, ob], [ac, vpred])

        return pd.logits, _act
Ejemplo n.º 37
0
    def __init__(self,
                 env,
                 observations,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()
        self.vf_latent = vf_latent
        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            # value network
            # NOTE: the original batch-size check set train_switch to True in both
            # branches, so the value head is always trainable.
            train_switch = True

            my_initializer = tf.contrib.layers.xavier_initializer()
            nin = vf_latent.get_shape()[1].value
            fc1_W_v = tf.get_variable(shape=[nin, 1],
                                      name='value_head_weight',
                                      trainable=train_switch,
                                      initializer=my_initializer)

            fc1_b_v = tf.get_variable(shape=[1],
                                      name='value_head_bias',
                                      trainable=train_switch,
                                      initializer=tf.constant_initializer(0))

            tf.summary.histogram("value_head_weight", fc1_W_v)
            tf.summary.histogram("value_head_bias", fc1_b_v)

            self.vf = tf.matmul(vf_latent, fc1_W_v) + fc1_b_v

            # self.vf = fc(vf_latent, 'vf_weights', 1)  # alternative using the fc helper
            self.vf = self.vf[:, 0]

        self.summary_tensor = None
        self.summary_writer = None
        self.step_id = 0
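
# Illustration (not part of the original class): the hand-rolled value head above
# is a single linear layer followed by dropping the trailing unit dimension.
# NumPy equivalent with hypothetical shapes:
import numpy as np

nbatch, nin = 4, 8
vf_latent = np.random.randn(nbatch, nin)
W = np.random.randn(nin, 1) * 0.1    # analogue of value_head_weight
b = np.zeros(1)                      # analogue of value_head_bias
vf = (vf_latent @ W + b)[:, 0]       # same [:, 0] squeeze as in the code above
print(vf.shape)                      # (4,)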
Ejemplo n.º 38
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 is_discrete=True):  #pylint: disable=W0613
        if isinstance(ac_space, gym.spaces.Discrete):
            self.is_discrete = True
        else:
            self.is_discrete = False

        print("nbatch%d" % (nbatch))

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        if self.is_discrete:
            nact = ac_space.n
        else:
            nact = ac_space.shape[0]
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, init_scale=0.01)
            vf = fc(h4, 'v', 1)[:, 0]

            if not self.is_discrete:
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, nact],
                                         initializer=tf.zeros_initializer())

        self.pdtype = make_pdtype(ac_space)
        if self.is_discrete:
            self.pd = self.pdtype.pdfromflat(pi)
            a0 = self.pd.sample()
        else:
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)
            a0 = self.pd.sample()

        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            # batch size must be 1 so that a = a[0] below does not discard actions
            assert a.shape[0] == 1
            a = a[0]
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Ejemplo n.º 39
0
    def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers,
              kind):
        assert isinstance(ob_space, gym.spaces.Box)
        assert isinstance(sensor_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        ob_sensor = U.get_placeholder(name="ob_sensor",
                                      dtype=tf.float32,
                                      shape=[sequence_length] +
                                      list(sensor_space.shape))

        ## Obfilter on sensor output
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=sensor_space.shape)

        obz_sensor = tf.clip_by_value(
            (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        #x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))

        ## Adapted from mlp_policy
        last_out = obz_sensor
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                name="vffc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        y = tf.layers.dense(last_out,
                            64,
                            name="vffinal",
                            kernel_initializer=U.normc_initializer(1.0))

        #y = ob_sensor
        #y = obz_sensor
        #y = tf.nn.relu(U.dense(y, 64, 'lin_ob', U.normc_initializer(1.0)))

        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                256,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(
                tf.layers.dense(x,
                                64,
                                name='lin',
                                kernel_initializer=U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        print(x.shape, y.shape)
        x = tf.concat([x, y], 1)

        ## Saver
        # self.saver = tf.train.Saver()

        logits = tf.layers.dense(x,
                                 pdtype.param_shape()[0],
                                 name="logits",
                                 kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = tf.layers.dense(
            x, 1, name="value", kernel_initializer=U.normc_initializer(1.0))[:,
                                                                             0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob, ob_sensor],
                               [ac, self.vpred, logits])
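
# Illustration (not part of the original class): the image branch and the sensor
# branch are fused by a simple feature concatenation before the shared logits and
# value heads. Shapes for the 'small' variant with a hypothetical batch of 4:
import numpy as np

img_feat = np.random.randn(4, 256)    # CNN branch output ('lin' layer, kind='small')
sensor_feat = np.random.randn(4, 64)  # sensor MLP output ("vffinal")
x = np.concatenate([img_feat, sensor_feat], axis=1)
print(x.shape)                        # (4, 320)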
Ejemplo n.º 40
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              bound_by_sigmoid=False,
              sigmoid_coef=1.,
              activation='tanh',
              normalize_obs=True,
              actions='gaussian',
              avg_norm_symmetry=False,
              symmetric_interpretation=False,
              stdclip=5.0,
              gaussian_bias=False,
              gaussian_from_binary=False,
              parallel_value=False,
              pv_layers=2,
              pv_hid_size=512,
              three=False):
        assert isinstance(ob_space, gym.spaces.Box)

        if actions == 'binary':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'beta':
            self.pdtype = pdtype = BetaPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32))
        elif actions == 'bernoulli':
            self.pdtype = pdtype = BernoulliPdType(ac_space.low.size)
        elif actions == 'gaussian':
            self.pdtype = pdtype = make_pdtype(ac_space)
        elif actions == 'cat_3':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
        elif actions == 'cat_5':
            self.pdtype = pdtype = MultiCategoricalPdType(
                low=np.zeros_like(ac_space.low, dtype=np.int32),
                high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
        else:
            assert False

        sequence_length = None

        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

        if normalize_obs:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            if avg_norm_symmetry:
                # Warning: only valid for the standard 41-dimensional observation vector
                ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) +
                           self.ob_rms.mean) / 2
                ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) +
                          self.ob_rms.std) / 2  # Pretty crude
            else:
                ob_mean = self.ob_rms.mean
                ob_std = self.ob_rms.std

            obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip,
                                   stdclip)

            #obz = tf.Print(obz, [self.ob_rms.mean], message='rms_mean', summarize=41)
            #obz = tf.Print(obz, [self.ob_rms.std], message='rms_std', summarize=41)
        else:
            obz = self.ob

        vpreds = []
        pparams = []

        for part in range(1 if not three else 3):
            part_prefix = "" if part == 0 else "part_" + str(part)

            # Predicted value
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    U.dense(last_out,
                            hid_size,
                            part_prefix + "vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))

            vpreds.append(
                U.dense(last_out,
                        1,
                        part_prefix + "vffinal",
                        weight_init=U.normc_initializer(1.0)))
            vpreds[-1] = vpreds[-1][:, 0]

            if parallel_value:
                last_out_2 = obz
                for i in range(pv_layers):
                    last_out_2 = tf.nn.tanh(
                        U.dense(last_out_2,
                                pv_hid_size,
                                part_prefix + "pv_vffc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0)))
                last_out_2 = U.dense(last_out_2,
                                     1,
                                     part_prefix + "pv_vffinal",
                                     weight_init=U.normc_initializer(1.0))
                vpreds[-1] += last_out_2[:, 0]

            last_out = obz
            if activation == 'tanh': activation = tf.nn.tanh
            elif activation == 'relu': activation = tf.nn.relu
            for i in range(num_hid_layers):
                dense = U.dense(last_out,
                                hid_size,
                                part_prefix + "polfc%i" % (i + 1),
                                weight_init=U.normc_initializer(1.0))
                last_out = activation(dense)

            if actions == 'gaussian':
                if gaussian_fixed_var:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    if bound_by_sigmoid:
                        mean = tf.nn.sigmoid(mean * sigmoid_coef)
                    logstd = tf.get_variable(
                        name=part_prefix + "logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    logstd = mean * 0.0 + logstd
                else:
                    mean = U.dense(last_out,
                                   pdtype.param_shape()[0] // 2,
                                   part_prefix + "polfinal",
                                   U.normc_initializer(0.01))
                    logstd = U.dense(last_out,
                                     pdtype.param_shape()[0] // 2,
                                     part_prefix + "polfinal_2",
                                     U.normc_initializer(0.01))
                if gaussian_bias:
                    mean = mean + 0.5

                pdparam = U.concatenate([mean, logstd], axis=1)
            elif actions == 'beta':
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "beta_lastlayer",
                                  U.normc_initializer(0.01))
                pdparam = tf.nn.softplus(pdparam)
            elif actions in ['bernoulli', 'binary']:
                if bound_by_sigmoid:
                    raise NotImplementedError(
                        "bound by sigmoid not implemented here")
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "polfinal",
                                  U.normc_initializer(0.01))
            elif actions in ['cat_3']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat3_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            elif actions in ['cat_5']:
                pdparam = U.dense(last_out,
                                  pdtype.param_shape()[0],
                                  part_prefix + "cat5_lastlayer",
                                  U.normc_initializer(0.01))
                # prob = tf.reshape(pdparam, [18, -1])
                # prob = tf.nn.softmax(prob)
                # elogit = tf.exp(pdparam)
                # pdparam = tf.Print(pdparam, [prob], summarize=18)
            else:
                assert False

            pparams.append(pdparam)

        pparams = tf.stack(pparams)
        vpreds = tf.stack(vpreds)
        pparams = tf.transpose(pparams,
                               perm=(1, 0, 2))  # [batchsize, networks, values]
        vpreds = tf.transpose(vpreds,
                              perm=(1, 0))  # [batchsize, networks, values]

        self.stochastic = tf.placeholder(name="stochastic",
                                         dtype=tf.bool,
                                         shape=())

        if three:
            batchsize = tf.shape(pdparam)[0]
            NO_OBSTACLES_ID = 5
            OBST_DIST = [278, 279, 280, 281, 282, 283, 284,
                         285]  # TODO: Alternative approach
            distances = [self.ob[:, i] for i in OBST_DIST]
            distances = tf.stack(distances, axis=1)
            no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0),
                                   tf.int32)
            distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1),
                                tf.int32)
            no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
            begin = tf.cast(tf.less(self.st, 75), tf.int32)
            take_id = (1 - begin) * (
                1 + no_obstacles_ahead
            )  # begin==1 => 0, begin==0 => 1 + no_obstacles_ahead

            take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
            pdparam = tf.gather_nd(pparams, take_id)

            self.vpred = tf.gather_nd(vpreds, take_id)
            #self.vpred = tf.Print(self.vpred, [take_id])
        else:
            self.vpred = vpreds[:, 0]
            pdparam = pparams[:, 0]

        self.pd = pdtype.pdfromflat(pdparam)

        if hasattr(self.pd, 'real_mean'):
            real_mean = self.pd.real_mean()
            ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
        else:
            ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        self._act = U.function([self.stochastic, self.ob, self.st],
                               [ac, self.vpred, ob_mean, ob_std])

        if actions == 'binary':
            self._binary_f = U.function([self.stochastic, self.ob, self.st],
                                        [ac, self.pd.flat, self.vpred])
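
# Illustration (not part of the original class): when three=True, one of the three
# stacked parameter sets is picked per sample via gather_nd. NumPy equivalent of
# that indexing with hypothetical sizes:
import numpy as np

batch, networks, values = 3, 3, 5
pparams = np.arange(batch * networks * values).reshape(batch, networks, values)
take_id = np.array([0, 2, 1])                  # per-sample network index
pdparam = pparams[np.arange(batch), take_id]   # == tf.gather_nd(pparams, stacked ids)
print(pdparam.shape)                           # (3, 5): one parameter row per sample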
Ejemplo n.º 41
0
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            #
            if Config.USE_COLOR_TRANSFORM:
                out_shape = processed_x.get_shape().as_list()

                mask_vbox = tf.Variable(tf.zeros_like(processed_x, dtype=bool), trainable=False)
                rh = .2 # hard-coded velocity box size
                # mh = tf.cast(tf.cast(out_shape[1], dtype=tf.float32)*rh, dtype=tf.int32)
                mh = int(out_shape[1]*rh)
                mw = mh*2
                mask_vbox = mask_vbox[:,:mh,:mw].assign(tf.ones([out_shape[0], mh, mw, out_shape[-1]], dtype=bool))
                masked = tf.where(mask_vbox, x=tf.zeros_like(processed_x), y=processed_x)

                # tf.image.adjust_brightness vs. ImageEnhance.Brightness
                # tf version is additive while PIL version is multiplicative
                delta_brightness = tf.get_variable(
                    name='randprocess_brightness',
                    initializer=tf.random_uniform([], -.5, .5),
                    trainable=False)

                # tf.image.adjust_contrast vs. PIL.ImageEnhance.Contrast
                delta_contrast = tf.get_variable(
                    name='randprocess_contrast',
                    initializer=tf.random_uniform([], .5, 1.5),
                    trainable=False,)

                # tf.image.adjust_saturation vs. PIL.ImageEnhance.Color
                delta_saturation = tf.get_variable(
                    name='randprocess_saturation',
                    initializer=tf.random_uniform([], .5, 1.5),
                    trainable=False,)

                processed_x1 = tf.image.adjust_brightness(masked, delta_brightness)
                processed_x1 = tf.clip_by_value(processed_x1, 0., 255.)
                processed_x1 = tf.where(mask_vbox, x=masked, y=processed_x1)
                processed_x2 = tf.image.adjust_contrast(processed_x1, delta_contrast)
                processed_x2 = tf.clip_by_value(processed_x2, 0., 255.)
                processed_x2 = tf.where(mask_vbox, x=masked, y=processed_x2)
                processed_x3 = tf.image.adjust_saturation(processed_x2, delta_saturation)
                processed_x3 = tf.clip_by_value(processed_x3, 0., 255.)
                processed_x3 = tf.where(mask_vbox, x=processed_x, y=processed_x3)
            else:
                processed_x3 = processed_x
            #
            h, self.dropout_assign_ops = choose_cnn(processed_x3)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
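The USE_COLOR_TRANSFORM branch keeps a small "velocity box" in the top-left corner untouched while randomizing the rest of the frame. Below is a minimal sketch of that masking pattern under assumed 64x64x3 frames and a fixed batch size; none of these names or sizes come from the original code.

import tensorflow as tf

N, H, W, C = 8, 64, 64, 3
images = tf.placeholder(tf.float32, [N, H, W, C])  # pixel values in [0, 255]

rh = .2                          # relative height of the velocity box
mh, mw = int(H * rh), int(H * rh) * 2

# Boolean mask that is True inside the top-left velocity box
box2d = tf.pad(tf.ones([mh, mw]), [[0, H - mh], [0, W - mw]])
mask_vbox = tf.cast(tf.tile(box2d[None, :, :, None], [N, 1, 1, C]), tf.bool)

# Zero out the box, augment everything else, then paste the original box back
masked = tf.where(mask_vbox, x=tf.zeros_like(images), y=images)
augmented = tf.clip_by_value(tf.image.adjust_brightness(masked, 30.), 0., 255.)
augmented = tf.where(mask_vbox, x=images, y=augmented)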
Ejemplo n.º 42
0
    def __init__(self,
                 env,
                 observations,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed
        
        策略网络的隐层,只是定义了计算图
        latent          latent state from which policy distribution parameters should be inferred
        
        值网络的隐层,如果共享隐层,策略网络的隐层 and 值网络的隐层相同

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        # flatten: collapse all dimensions except the first (batch) dimension
        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)
        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)
        # pdtype attaches a fully connected layer after the latent layer, chosen
        # according to the action space type (see the standalone sketch after this
        # constructor).
        # self.pd is an object wrapping the action probability distribution; it
        # provides methods for distribution-related quantities such as the
        # sample and neglogp computed below.
        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)
        # Sample an action from the probability distribution
        self.action = self.pd.sample()
        # Calculate the negative log-probability of the sampled action
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            # each state has a single scalar V value
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:, 0]
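For context, a rough sketch of what pdtype.pdfromlatent amounts to for a Discrete action space: a fully connected layer on the latent produces the flat distribution parameters (logits), which are then wrapped in a categorical pd object. This is an illustration only; the layer name 'pi_demo' is made up, and the import path is my assumption rather than code from this example.

import gym
import tensorflow as tf
from baselines.common.distributions import make_pdtype

latent = tf.placeholder(tf.float32, [None, 64])
pdtype = make_pdtype(gym.spaces.Discrete(6))

# Roughly what pd, pi = pdtype.pdfromlatent(latent, init_scale=0.01) does:
pi = tf.layers.dense(latent, pdtype.param_shape()[0], name='pi_demo')
pd = pdtype.pdfromflat(pi)

action = pd.sample()          # sampled action, shape [None]
neglogp = pd.neglogp(action)  # -log pi(action | obs), shape [None]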
    def __init__(self, sess, ob_space, action_space, nbatch, nsteps, reuse = False):
        # This will use to initialize our kernels
        gain = np.sqrt(2)

        # Based on the action space, this selects the probability distribution type
        # we will use to distribute actions in our stochastic policy (in our case
        # DiagGaussianPdType, i.e. a diagonal Gaussian / 3D normal distribution;
        # see the sketch after this class)
        self.pdtype = make_pdtype(action_space)

        height, width, channel = ob_space.shape
        ob_shape = (height, width, channel)

        # Create the input placeholder
        inputs_ = tf.placeholder(tf.float32, [None, *ob_shape], name="input")

        # Normalize the images
        scaled_images = tf.cast(inputs_, tf.float32) / 255.

        """
        Build the model
        3 CNN for spatial dependencies
        Temporal dependencies is handle by stacking frames
        (Something funny nobody use LSTM in OpenAI Retro contest)
        1 common FC
        1 FC for policy
        1 FC for value
        """
        with tf.variable_scope("model", reuse = reuse):
            conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
            conv2 = conv_layer(conv1, 64, 4, 2, gain)
            conv3 = conv_layer(conv2, 64, 3, 1, gain)
            flatten1 = tf.layers.flatten(conv3)
            fc_common = fc_layer(flatten1, 512, gain=gain)

            # This builds a fully connected layer that returns a probability
            # distribution over actions (self.pd) and our pi logits (self.pi).
            self.pd, self.pi = self.pdtype.pdfromlatent(fc_common, init_scale=0.01)

            # Calculate the v(s)
            vf = fc_layer(fc_common, 1, activation_fn=None)[:, 0]

        self.initial_state = None

        # Sample an action from the action distribution (remember this is a stochastic
        # policy, so we don't always take the action with the highest probability;
        # e.g. with two actions at probabilities 0.7 and 0.3, we pick the second 30% of the time)
        a0 = self.pd.sample()

        # Function used to take a step; returns the action to take and V(s)
        def step(state_in, *_args, **_kwargs):
            action, value = sess.run([a0, vf], {inputs_: state_in})
            return action, value

        # Function that calculates only the V(s)
        def value(state_in, *_args, **_kwargs):
            return sess.run(vf, {inputs_: state_in})

        # Function that outputs only the action to take
        def select_action(state_in, *_args, **_kwargs):
            return sess.run(a0, {inputs_: state_in})

        self.inputs_ = inputs_
        self.vf = vf
        self.step = step
        self.value = value
        self.select_action = select_action
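The comment above mentions DiagGaussianPdType for continuous actions. A small runnable sketch of that distribution type, assuming the baselines import path and a 3-dimensional Box action space (illustration only, not part of the original class): for a Box space, make_pdtype yields a diagonal Gaussian whose flat parameters are the means followed by the log standard deviations.

import numpy as np
import gym
import tensorflow as tf
from baselines.common.distributions import make_pdtype

ac_space = gym.spaces.Box(low=-1., high=1., shape=(3,), dtype=np.float32)
pdtype = make_pdtype(ac_space)

print(pdtype.param_shape())  # [6]: 3 means followed by 3 log-stds

flat = tf.placeholder(tf.float32, [None, 6])
pd = pdtype.pdfromflat(flat)

a = pd.sample()   # stochastic action: mean + std * noise
m = pd.mode()     # deterministic action: the mean

with tf.Session() as sess:
    params = np.zeros((1, 6), dtype=np.float32)  # mean 0, log_std 0 (std 1)
    print(sess.run([a, m], {flat: params}))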