예제 #1
0
파일: policies.py 프로젝트: stjordanis/disc
    def __init__(self,
                 env,
                 observations,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:, 0]
예제 #2
0
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     pdparam = fc(latent_vector,
                  'pi',
                  self.ncat,
                  init_scale=init_scale,
                  init_bias=init_bias)
     return self.pdfromflat(pdparam), pdparam
예제 #3
0
파일: models.py 프로젝트: stjordanis/disc
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper.
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(
        conv(scaled_images,
             'c1',
             nf=32,
             rf=8,
             stride=4,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h2 = activ(
        conv(h,
             'c2',
             nf=64,
             rf=4,
             stride=2,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = activ(
        conv(h2,
             'c3',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
예제 #4
0
파일: policies.py 프로젝트: aausch/TRPPO
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(
                fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(
                fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def step_policyflat(ob, *_args, **_kwargs):
            a, v, neglogp, polciyflat = sess.run(
                [a0, vf, neglogp0, self.pd.flat], {X: ob})
            return a, v, self.initial_state, neglogp, polciyflat

        def step_test(ob, *_args, **_kwargs):
            a = sess.run([self.pd.mean], {X: ob})
            return a

        self.X = X
        self.vf = vf
        self.step = step
        self.step_policyflat = step_policyflat
        self.value = value
        self.step_test = step_test
예제 #5
0
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     mean = fc(latent_vector,
               'pi',
               self.size,
               init_scale=init_scale,
               init_bias=init_bias)
     logstd = tf.get_variable(name='logstd',
                              shape=[1, self.size],
                              initializer=tf.zeros_initializer())
     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
     return self.pdfromflat(pdparam), mean
예제 #6
0
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     mean = fc(latent_vector,
               'pi',
               self.size,
               init_scale=init_scale,
               init_bias=init_bias)
     # logstd = tf.maximum(np.log(0.2)*tf.ones_like(mean,dtype='float32'),tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer()))
     logstd = tf.get_variable(name='pi/logstd',
                              shape=[1, self.size],
                              initializer=tf.zeros_initializer())
     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
     return self.pdfromflat(pdparam), mean
예제 #7
0
파일: models.py 프로젝트: stjordanis/disc
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = fc(h,
                   'mlp_fc{}'.format(i),
                   nh=num_hidden,
                   init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

        return h
예제 #8
0
파일: policies.py 프로젝트: aausch/TRPPO
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  # pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def step_test(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run(
                [tf.argmax(self.pd.logits, axis=-1), vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def step_policyflat(ob, *_args, **_kwargs):
            a, v, neglogp, polciyflat = sess.run(
                [a0, vf, neglogp0, self.pd.logits], {X: ob})
            # a, v, self.initial_state, neglogp = self.step(ob, *_args, **_kwargs)
            # pa = np.exp(-neglogp)
            return a, v, self.initial_state, neglogp, polciyflat

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.step_test = step_test
        self.step_policyflat = step_policyflat
        self.value = value
예제 #9
0
파일: models.py 프로젝트: stjordanis/disc
    def network_fn(X):
        h = tf.cast(X, tf.float32) / 255.

        activ = tf.nn.relu
        h = activ(
            conv(h,
                 'c1',
                 nf=8,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h = activ(
            conv(h,
                 'c2',
                 nf=16,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h = conv_to_fc(h)
        h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2)))
        return h