Example #1
    def __init__(self,
                 env,
                 observations,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        estimate_q      if True, estimate a Q-function over discrete actions instead of a scalar value function

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Select the probability distribution type based on the action space
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:, 0]
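The snippet above only constructs graph nodes. A minimal usage sketch, assuming the enclosing class (PolicyWithValue in baselines) was instantiated as policy and obs_batch is a numpy array shaped like policy.X; the helper name is illustrative:

# Hypothetical helper: sample actions, values and neglogps for a batch of
# observations in a single session call.
def sample_actions(policy, obs_batch):
    return policy.sess.run(
        [policy.action, policy.vf, policy.neglogp],
        feed_dict={policy.X: obs_batch})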
Example #2
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc  # helpers assumed from baselines


def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from the Nature DQN paper (Mnih et al., 2015): three convolutional
    layers followed by a 512-unit fully connected head.
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(
        conv(scaled_images,
             'c1',
             nf=32,
             rf=8,
             stride=4,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h2 = activ(
        conv(h,
             'c2',
             nf=64,
             rf=4,
             stride=2,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = activ(
        conv(h2,
             'c3',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
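A short usage sketch for nature_cnn, assuming Atari-style observations; the placeholder name is illustrative:

# Feed stacked 84x84 uint8 frames; nature_cnn rescales them to [0, 1]
# internally and returns a 512-dimensional feature vector per observation.
obs_ph = tf.placeholder(tf.uint8, shape=(None, 84, 84, 4), name='obs')
features = nature_cnn(obs_ph)  # shape: (batch, 512)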
Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi_logits = fc(h, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h, 'q', nact)

        # could change this to use self.pi instead; sample applies the
        # Gumbel-max trick, which expects logits rather than probabilities
        a = sample(pi_logits)
        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.pi_logits = pi_logits
        self.q = q
        self.vf = q

        def step(ob, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0 = sess.run([a, pi], {X: ob})
            return a0, pi0, []  # dummy state

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([pi, q], {X: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(a, {X: ob})

        self.step = step
        self.out = out
        self.act = act
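A hedged construction-and-stepping sketch; the class name AcerCnnPolicy matches baselines' ACER, while env is assumed to be a vectorized environment whose observations are already frame-stacked:

import tensorflow as tf

sess = tf.Session()
# nsteps=1 so the placeholder batch equals the number of parallel envs.
policy = AcerCnnPolicy(sess, env.observation_space, env.action_space,
                       nenv=4, nsteps=1, nstack=4, reuse=False)
sess.run(tf.global_variables_initializer())

obs = env.reset()                   # uint8, shape (4, nh, nw, nc * nstack)
actions, mus, _ = policy.step(obs)  # stateless policy: dummy state returned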
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
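The recurrent variant threads the LSTM state and done-mask through successive step calls. A rollout-loop sketch, with policy, env, nenv and horizon assumed:

import numpy as np

obs = env.reset()
state = policy.initial_state              # zeros, shape (nenv, 2 * nlstm)
dones = np.zeros(nenv, dtype=np.float32)  # 1.0 resets the carried LSTM state
for _ in range(horizon):
    actions, mus, state = policy.step(obs, state, dones)
    obs, rewards, done_flags, _ = env.step(actions)
    dones = np.asarray(done_flags, dtype=np.float32)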
Example #5
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = fc(h,
                   'mlp_fc{}'.format(i),
                   nh=num_hidden,
                   init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

        return h
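network_fn closes over num_layers, num_hidden, activation and layer_norm; in baselines these are bound by an enclosing factory. A sketch of that pattern (defaults assumed to match baselines' mlp):

import numpy as np
import tensorflow as tf
from baselines.a2c.utils import fc

def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False):
    # Bind the hyperparameters, return the graph-building closure from above.
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden,
                   init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
        return h
    return network_fn

# Usage: build the closure once, then apply it to an observation tensor.
obs_ph = tf.placeholder(tf.float32, shape=(None, 11))
latent = mlp(num_layers=3, num_hidden=128)(obs_ph)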
Example #6
    def network_fn(X):
        h = tf.cast(X, tf.float32) / 255.

        activ = tf.nn.relu
        h = activ(
            conv(h,
                 'c1',
                 nf=8,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h = activ(
            conv(h,
                 'c2',
                 nf=16,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h = conv_to_fc(h)
        h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
        return h
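This network_fn likewise closes over conv_kwargs (the architecture matches baselines' cnn_small). A brief usage sketch, with the factory assumed:

import tensorflow as tf

# Assuming network_fn was returned by a factory that bound conv_kwargs,
# e.g. network_fn = cnn_small(): feed uint8 frames, get 128-d features.
obs_ph = tf.placeholder(tf.uint8, shape=(None, 84, 84, 4))
latent = network_fn(obs_ph)  # shape: (batch, 128)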
Example #7
    def __init__(self,
                 env,
                 observations,
                 latent,
                 dones,
                 states=None,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        dones           tensorflow placeholder for the episode-termination mask

        states          optional dict with 'current' and 'next' recurrent state tensors (None for feedforward policies)

        estimate_q      if True, estimate a Q-function over discrete actions instead of a scalar value function

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        """
        self.X = observations
        self.dones = dones
        self.pdtype = make_pdtype(env.action_space)
        self.states = states
        self.sess = sess or tf.get_default_session()

        vf_latent = vf_latent if vf_latent is not None else latent

        with tf.variable_scope('policy'):
            latent = tf.layers.flatten(latent)
            # Select the probability distribution type based on the action space
            self.pd, self.pi = self.pdtype.pdfromlatent(latent,
                                                        init_scale=0.01)

            with tf.variable_scope('sample_action'):
                self.action = self.pd.sample()

            with tf.variable_scope('negative_log_probability'):
                # Calculate the neg log of our probability
                self.neglogp = self.pd.neglogp(self.action)

        with tf.variable_scope('value'):
            vf_latent = tf.layers.flatten(vf_latent)

            if estimate_q:
                assert isinstance(env.action_space, gym.spaces.Discrete)
                self.q = fc(vf_latent, 'q', env.action_space.n)
                self.value = self.q
            else:
                self.value = fc(vf_latent, 'value', 1, init_scale=0.01)
                self.value = self.value[:, 0]

        self.step_input = {
            'observations': observations,
            'dones': self.dones,
        }

        self.step_output = {
            'actions': self.action,
            'values': self.value,
            'neglogpacs': self.neglogp,
        }
        if self.states:
            self.initial_state = np.zeros(self.states['current'].get_shape())
            self.step_input.update({'states': self.states['current']})
            self.step_output.update({
                'states': self.states['current'],
                'next_states': self.states['next']
            })
        else:
            self.initial_state = None
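The step_input/step_output dictionaries define a feed/fetch contract. A sketch of a step helper built on them; everything outside those dicts is an assumption:

# Hypothetical helper: run one policy step via the feed/fetch dictionaries.
def step(policy, obs, dones, states=None):
    feed = {
        policy.step_input['observations']: obs,
        policy.step_input['dones']: dones,
    }
    if states is not None:
        feed[policy.step_input['states']] = states
    # Returns a dict with 'actions', 'values' and 'neglogpacs' (plus
    # 'states'/'next_states' when the policy is recurrent).
    return policy.sess.run(policy.step_output, feed_dict=feed)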