예제 #1
0
 def pd(self, obs):
     """Build the two-layer policy stack on ``obs`` and return its distribution.

     Both layers ('pi_1', 'pi_2') are created under the variable scope
     ``self.name`` with ``self.nh`` units and the ``self.activ`` nonlinearity.
     Only the first value returned by ``self.pdtype.pdfromlatent`` is kept.
     """
     scope_name = self.name
     nonlinearity = self.activ
     width = self.nh
     with tf.variable_scope(scope_name):
         hidden = obs
         for layer_scope in ('pi_1', 'pi_2'):
             hidden = nonlinearity(
                 fc(hidden, scope=layer_scope, nh=width, init_scale=np.sqrt(2)))
         dist, _ = self.pdtype.pdfromlatent(hidden)
     return dist
예제 #2
0
 def __init__(self, obs, pi, trajectories, pdtype, name='Model_Based',
              nh=64, activ=tf.nn.tanh):
     """Build policy and value heads over the concatenated inputs.

     ``obs``, ``pi`` and ``trajectories`` are concatenated on the last axis
     and fed to two separate two-layer stacks created under the variable
     scope ``name``: 'pi_1'/'pi_2' (``nh`` units each) and 'vf_1'/'vf_2'
     (``nh`` units, then a single unit).

     Sets ``self.pdtype``, ``self.pd``, ``self.pi``, ``self.act`` (a sample
     drawn from ``self.pd``) and ``self.vf``.
     """
     self.pdtype = pdtype
     gain = np.sqrt(2)
     features = tf.concat([obs, pi, trajectories], axis=-1)
     with tf.variable_scope(name):
         policy_hidden = activ(fc(features, scope='pi_1', nh=nh, init_scale=gain))
         policy_latent = activ(fc(policy_hidden, scope='pi_2', nh=nh, init_scale=gain))
         value_hidden = activ(fc(features, scope='vf_1', nh=nh, init_scale=gain))
         value_out = activ(fc(value_hidden, scope='vf_2', nh=1, init_scale=gain))
     self.pd, self.pi = self.pdtype.pdfromlatent(policy_latent)
     self.act = self.pd.sample()
     self.vf = value_out
예제 #3
0
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper.

    The input is cast to float32 and divided by 255, passed through three
    ReLU ``conv`` layers (c1: nf=32, rf=8, stride=4; c2: nf=64, rf=4,
    stride=2; c3: nf=64, rf=3, stride=1), flattened with ``conv_to_fc`` and
    finished with a 512-unit ReLU ``fc`` layer named 'fc1'. ``conv_kwargs``
    is forwarded unchanged to every ``conv`` call.
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    net = tf.cast(unscaled_images, tf.float32) / 255.
    # (scope, number of filters, filter size, stride) for each conv layer.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )
    for scope, filters, size, step in conv_specs:
        net = relu(conv(net, scope, nf=filters, rf=size, stride=step,
                        init_scale=gain, **conv_kwargs))
    flat = conv_to_fc(net)
    return relu(fc(flat, 'fc1', nh=512, init_scale=gain))
예제 #4
0
    def __init__(self, obs, nactions, actions, nobs, rewards, policy,
                 trajectory_length=8, name='Env_Model', LR=tf.constant(1e-4),
                 nh=64, nout=64, vcoef=0.5, activ=tf.nn.tanh, max_grad=0.5):
        """Build the environment model's rollout graph and its training op.

        One network, created under the variable scope ``name``, maps an
        observation concatenated with a one-hot action to a predicted next
        observation (sigmoid, ``nout`` units) and a predicted reward (one
        unit). It is applied ``trajectory_length`` times per initial action to
        imagine rollouts, and once on ``obs``/``actions`` to build the loss.

        Args:
            obs: observation tensor that the rollouts and the training step
                start from.
            nactions: number of discrete actions (depth of the one-hot code).
            actions: action tensor for the training step.
            nobs: target for the next-observation prediction loss.
            rewards: target for the reward prediction loss.
            policy: accepted but not used in this constructor.
            trajectory_length: number of imagined steps per rollout.
            name: variable scope that holds the model weights.
            LR: learning rate passed to ``tf.train.AdamOptimizer``.
            nh: hidden layer width.
            nout: width of the predicted observation.
            vcoef: weight of the reward loss in the total loss.
            activ: activation function.
            max_grad: global-norm clipping threshold; None disables clipping.

        Sets ``self.all_trajectories`` and ``self.all_rewards`` (one stacked
        tensor per initial action), ``self.env_loss`` and ``self.trainer``.
        """
        gain = np.sqrt(2)

        def model_step(x_in):
            # One application of the shared transition/reward network.
            # AUTO_REUSE lets every rollout step and the training graph share
            # a single set of weights instead of colliding on re-creation.
            # Assumes fc creates its variables under the enclosing variable
            # scope -- TODO confirm against fc's implementation.
            with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
                ns_1 = activ(fc(x_in, 'ns_1', nh, init_scale=gain))
                ns_2 = tf.nn.sigmoid(fc(ns_1, 'ns_2', nout, init_scale=gain))
                vf_1 = activ(fc(x_in, 'vf_1', nh, init_scale=gain))
                # The second value layer needs its own scope: it has a
                # different shape from 'vf_1' and cannot share its variables.
                vf_2 = activ(fc(vf_1, 'vf_2', 1, init_scale=gain))
            return ns_2, vf_2

        all_trajectories = []
        all_rewards = []
        # rollout graph: one imagined trajectory per initial action
        for first_action in range(nactions):
            # Broadcast the constant first action over the leading dims of obs
            # so its one-hot code has the same rank as obs.
            start_actions = (
                tf.zeros_like(obs[..., 0], dtype=tf.int32) + first_action)
            action_list = [start_actions]
            rollout_obs = [obs]
            rollout_rews = []

            for t in range(trajectory_length):
                # NOTE(review): predicted observations (nout wide) are fed
                # back through the same weights, so this presumably requires
                # the last dimension of obs to equal nout -- confirm.
                x_in = tf.concat(
                    [rollout_obs[t], tf.one_hot(action_list[t], nactions)],
                    axis=-1)
                next_obs, pred_rew = model_step(x_in)
                rollout_obs.append(next_obs)
                rollout_rews.append(pred_rew)
                # pdfromlatent returns a (distribution, parameters) pair.
                # NOTE(review): self.pdtype is not assigned in this
                # constructor and `policy` is unused; this call also sits
                # outside the reuse scope above and is made once per step.
                # Confirm where pdtype comes from and whether the policy's
                # own head should be reused here.
                next_pd, _ = self.pdtype.pdfromlatent(rollout_obs[t + 1])
                action_list.append(next_pd.sample())

            all_trajectories.append(tf.stack(rollout_obs[1:]))
            all_rewards.append(tf.stack(rollout_rews))

        # Keep the rollout outputs reachable; they were previously discarded.
        self.all_trajectories = all_trajectories
        self.all_rewards = all_rewards

        # training graph
        x_train = tf.concat([obs, tf.one_hot(actions, nactions)], axis=-1)
        pred_nobs, pred_rewards = model_step(x_train)

        prediction_loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(pred_nobs - nobs), axis=-1))
        # NOTE(review): pred_rewards has a trailing dimension of size 1;
        # confirm `rewards` has the same rank so this does not broadcast.
        value_loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(pred_rewards - rewards), axis=-1))
        env_loss = prediction_loss + vcoef * value_loss
        self.env_loss = env_loss

        optimizer = tf.train.AdamOptimizer(LR)
        params = tf.trainable_variables()
        grads = tf.gradients(env_loss, params)
        if max_grad is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad)
        self.trainer = optimizer.apply_gradients(list(zip(grads, params)))
예제 #5
0
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     """Turn a latent vector into a distribution and its mean.

     The mean comes from an ``fc`` layer named 'pi' with ``self.size``
     outputs. A 'logstd' variable of shape [1, self.size], initialised to
     -1, is added to a zero tensor shaped like the mean. Mean and log-std
     are concatenated on the last axis and passed to ``self.pdfromflat``.

     Returns the pair ``(distribution, mean)``.
     """
     mean = fc(latent_vector, 'pi', self.size,
               init_scale=init_scale, init_bias=init_bias, r=self.r)
     logstd = tf.get_variable(
         name='logstd',
         shape=[1, self.size],
         initializer=tf.constant_initializer(-1.))
     broadcast_logstd = tf.zeros_like(mean) + logstd
     flat_params = tf.concat([mean, broadcast_logstd], axis=-1)
     return self.pdfromflat(flat_params), mean
예제 #6
0
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     """Project the latent vector to ``self.ncat`` outputs via the 'pi' layer.

     Returns a pair: the distribution built by ``self.pdfromflat`` from that
     projection, and the projection itself.
     """
     flat_params = fc(
         latent_vector,
         'pi',
         self.ncat,
         init_scale=init_scale,
         init_bias=init_bias,
         r=self.r)
     dist = self.pdfromflat(flat_params)
     return dist, flat_params
예제 #7
0
 def __call__(self, tin):
     """Batch-normalise ``tin`` and run it through ``self.layers`` fc layers.

     Batch normalisation is built with ``reuse=tf.AUTO_REUSE`` and
     ``self.batch_norm_kwargs``. Layer ``i`` is named 'em_fc<i>', has
     ``self.nh`` units, and is followed by ``self.activ``. Returns the
     output of the last layer.
     """
     hidden = tf.layers.batch_normalization(
         tin, reuse=tf.AUTO_REUSE, **self.batch_norm_kwargs)
     for idx in range(self.layers):
         layer_scope = 'em_fc' + str(idx)
         pre_activation = fc(hidden, layer_scope, nh=self.nh,
                             init_scale=np.sqrt(2), r=self.r)
         hidden = self.activ(pre_activation)
     return hidden
예제 #8
0
 def __call__(self, tin):
     """Embed ``tin`` with ``nature_cnn`` followed by ``self.layers`` fc layers.

     ``self.conv_kwargs`` is forwarded to ``nature_cnn``. Layer ``i`` is
     named 'em_fc<i>', has ``self.nh`` units, and is followed by
     ``self.activ``. Returns the output of the last layer.
     """
     features = nature_cnn(tin, **self.conv_kwargs)
     for idx in range(self.layers):
         layer_scope = 'em_fc' + str(idx)
         features = self.activ(
             fc(features, layer_scope, nh=self.nh, init_scale=np.sqrt(2)))
     return features
예제 #9
0
 def __call__(self, tin):
     """Pass ``tin`` through ``self.layers`` fully connected layers.

     Layer ``i`` is named 'em_fc<i>', has ``self.nh`` units, and is followed
     by ``self.activ``. With zero layers the input is returned unchanged.
     """
     out = tin
     for layer_scope in ['em_fc' + str(n) for n in range(self.layers)]:
         pre_activation = fc(out, layer_scope, nh=self.nh,
                             init_scale=np.sqrt(2), r=self.r)
         out = self.activ(pre_activation)
     return out
예제 #10
0
    def __init__(self,
                 mgoal,
                 state,
                 pstate,
                 pdtype=None,
                 nhist=4,
                 nin=32,
                 ngoal=16,
                 nembed=8,
                 manager=False,
                 nh=64,
                 activ=tf.nn.relu,
                 name=1,
                 nbatch=1,
                 neplength=1e2,
                 cell=tf.contrib.rnn.LSTMCell,
                 val=False,
                 recurrent=1):
        '''
        Build a recurrent policy/value level under the scope "level<name>".

        INPUTS:
            mgoal - goal tensor; indexed as [:, :, :nin], so rank 3
            state - input tensor, fed to the 'em_fc2' embedding and used as
                    `inputs` of tf.nn.dynamic_rnn
            pstate - recurrent state tensor; pstate[:, 0, :] is the initial
                     RNN state
            pdtype - object providing pdfromlatent(latent, init_scale=...);
                     it is used unconditionally, so None will fail
            nhist - number of past steps used by the manager reward
            nin - number of leading goal features kept from mgoal
            ngoal - width of the state embedding when manager is True
            manager - when True, also build inr, fvec and traj_sim
            nh - hidden width (and embedding width when manager is False)
            activ - activation function
            name - suffix of the variable scope
            nbatch, neplength - static shape of the manager reward
                    accumulator; both are cast to int before use
            cell - RNN cell constructor, called as cell(nh, state_is_tuple=False)
            nembed, val, recurrent - accepted but unused in this constructor

        SETS:
            mgoal, state, pstate, manager, name, pdtype, pd, pi, nstate,
            aout, nlp, vf; and inr, fvec, traj_sim only when manager is True
        '''
        self.mgoal = mgoal[:, :, :nin]
        self.state = state
        self.pstate = pstate
        nph = nh
        self.manager = manager
        self.name = name
        nout = ngoal if manager else nh
        self.pdtype = pdtype

        with tf.variable_scope("level" + str(self.name)):
            em_h2 = activ(
                fc(state, 'em_fc2', nh=nout, init_scale=np.sqrt(2), r=True))
            embed_goal = activ(
                fc(self.mgoal, 'embed', nh=nph, init_scale=np.sqrt(2), r=True))

            cell = cell(nh, state_is_tuple=False)
            a_h1, nstate = tf.nn.dynamic_rnn(cell,
                                             inputs=state,
                                             initial_state=pstate[:, 0, :])
            c_h1 = activ(a_h1)
            pi_h2 = activ(
                fc(c_h1, 'pi_fc2', nh=nph, init_scale=np.sqrt(2), r=True))
            vf_h2 = activ(
                fc(c_h1, 'vf_fc2', nh=nh, init_scale=np.sqrt(2), r=True))
            vout = tf.nn.tanh(fc(vf_h2, 'vf', 1, r=True))[:, :, 0]

            # The goal embedding is added to the policy features.
            pout = embed_goal + pi_h2

            self.pd, self.pi = self.pdtype.pdfromlatent(pout, init_scale=0.01)
            aout = self.pd.sample()
            neglogpout = self.pd.neglogp(aout)

        self.nstate = nstate
        self.aout = aout
        self.nlp = neglogpout

        def bcs(state, spad, gpad, nhist):
            # For each of the last nhist offsets, add the cosine similarity
            # between (state now - state k steps earlier) and the goal
            # emitted k steps earlier. spad/gpad are front-padded with nhist
            # zeros along axis 1, so every slice keeps the original length.
            # Shape entries must be integers; neplength defaults to a float.
            rew = tf.zeros(shape=(int(nbatch), int(neplength)),
                           dtype=tf.float32)
            for t in range(nhist):
                svec = state - spad[:, nhist - t - 1:-(t + 1), :]
                gvec = gpad[:, nhist - t - 1:-(t + 1), :]
                nsv = tf.nn.l2_normalize(svec, axis=-1)
                ngv = tf.nn.l2_normalize(gvec, axis=-1)
                rew += tf.reduce_sum(tf.multiply(nsv, ngv), axis=-1)
            return rew

        def fcs(fvec, gvec, nhist):
            # Cosine similarity along the last axis (nhist is unused).
            nfv = tf.nn.l2_normalize(fvec, axis=-1)
            ngv = tf.nn.l2_normalize(gvec, axis=-1)
            sim = tf.reduce_sum(tf.multiply(nfv, ngv), axis=-1)
            return sim

        self.vf = vout
        if self.manager:
            pad = tf.constant([[0, 0], [nhist, 0], [0, 0]])
            spad = tf.pad(em_h2, pad, "CONSTANT")
            gpad = tf.pad(aout, pad, "CONSTANT")

            # 1.0 keeps this a true division under Python 2 as well, where
            # 1 / nhist would truncate to 0.
            self.inr = 1.0 / nhist * tf.stop_gradient(
                bcs(em_h2, spad, gpad, nhist))

            # Repeat each batch entry's last state nhist times along the time
            # axis. Tiling on axis 0 and reshaping would mix rows from
            # different batch entries whenever nbatch > 1.
            lstate = em_h2[:, -1, :]
            rep = tf.tile(tf.expand_dims(lstate, 1), [1, nhist, 1])
            spadf = tf.concat([em_h2, rep], axis=1)
            # State nhist steps ahead (clamped to the last state) minus the
            # current state.
            self.fvec = spadf[:, nhist:] - em_h2
            self.traj_sim = fcs(self.fvec, aout, nhist)
예제 #11
0
    def __init__(self,
                 mgoal,
                 state,
                 pstate,
                 pdtype=None,
                 nhist=4,
                 nin=32,
                 ngoal=16,
                 recurrent=0,
                 nembed=8,
                 manager=False,
                 nh=64,
                 activ=tf.nn.relu,
                 name=1,
                 nbatch=1e3,
                 val=True,
                 feed_fvec=None):
        '''
        Build a feed-forward policy/value level under the scope "level<name>".

        INPUTS:
            mgoal - goal tensor of supervisor; indexed as [:, :nin]
            state - observation tensor post-embedding
            pstate - recurrent state tensor, ignored in this call
            pdtype - object providing pdfromlatent(latent, init_scale=...);
                     it is used unconditionally, so None will fail
            nhist - number of past steps used by the manager reward
            nin - number of leading goal features kept from mgoal
            ngoal - width of the state embedding when manager is True
            manager - when True, also build inr, fvec, train_nlp, loss_nlp
                      and traj_sim
            nh - hidden width (and embedding width when manager is False)
            activ - activation function
            name - suffix of the variable scope
            nbatch - static length of the manager reward accumulator; cast
                     to int before use
            feed_fvec - optional tensor used to build loss_nlp when manager
                        is True; loss_nlp is None if it is not given
            recurrent, nembed, val - accepted but unused in this constructor
        '''
        self.mgoal = mgoal[:, :nin]
        self.state = state
        self.manager = manager
        self.name = name
        self.initial_state = None
        nout = ngoal if manager else nh
        self.nout = nout
        self.pdtype = pdtype

        with tf.variable_scope("level" + str(self.name)):
            em_h2 = fc(state, 'em_fc2', nh=nout, init_scale=np.sqrt(2))
            embed_goal = fc(self.mgoal, 'embed', nh=nh, init_scale=np.sqrt(2))
            pi_h1 = activ(fc(em_h2, 'pi_fc1', nh=nh, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=nh, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(state, 'vf_fc1', nh=nh, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=nh, init_scale=np.sqrt(2)))

            # The goal embedding multiplies the policy features elementwise.
            pout = embed_goal * pi_h2

            vout = tf.nn.tanh(fc(vf_h2, 'vf', 1))[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pout, init_scale=0.01)
            aout = self.pd.sample()
            neglogpout = self.pd.neglogp(aout)

        self.aout = aout
        self.nlp = neglogpout
        self.nstate = None

        # tf.fill needs integer dims; nbatch defaults to the float 1e3.
        nrows = int(nbatch)

        def bcs(state, spad, gpad, nhist):
            # For each of the last nhist offsets, add the cosine similarity
            # between (state now - state k rows earlier) and the goal emitted
            # k rows earlier. spad/gpad are front-padded with nhist zero rows,
            # so every slice keeps the original length.
            rew = tf.fill([nrows], 0.0)
            for t in range(nhist):
                svec = state - spad[nhist - t - 1:-(t + 1), :]
                gvec = gpad[nhist - t - 1:-(t + 1), :]
                nsv = tf.nn.l2_normalize(svec, axis=-1)
                ngv = tf.nn.l2_normalize(gvec, axis=-1)
                rew += tf.reduce_sum(tf.multiply(nsv, ngv), axis=-1)
            return rew

        def sparse_bcs(state, spad, gpad, nhist, axis=0):
            # Sparse variant of bcs: counts, per offset, the rows where the
            # state difference equals the goal exactly and the goal is not
            # all zeros. Not called anywhere in this constructor.
            rew = tf.fill([nrows], 0.0)
            for t in range(nhist):
                if axis == 1:
                    svec = state - spad[:, nhist - t - 1:-(t + 1), :]
                    gvec = gpad[:, nhist - t - 1:-(t + 1), :]
                else:
                    svec = state - spad[nhist - t - 1:-(t + 1), :]
                    gvec = gpad[nhist - t - 1:-(t + 1), :]
                # 1.0 where every component of svec equals gvec.
                delta_gs = tf.to_float(
                    tf.equal(
                        tf.reduce_mean(tf.to_float(tf.equal(svec, gvec)),
                                       axis=-1), 1.))
                # 1.0 where the goal vector is entirely zero.
                zero_mask = tf.to_float(
                    tf.equal(
                        tf.reduce_mean(tf.to_float(
                            tf.equal(tf.zeros_like(gvec), gvec)),
                                       axis=-1), 1.))
                delta_gs *= (1. - zero_mask)
                rew += delta_gs
            return rew

        def fcs(fvec, gvec, nhist):
            # Cosine similarity along the last axis (nhist is unused).
            nfv = tf.nn.l2_normalize(fvec, axis=-1)
            ngv = tf.nn.l2_normalize(gvec, axis=-1)
            sim = tf.reduce_sum(tf.multiply(nfv, ngv), axis=-1)
            return sim

        self.vf = vout
        if self.manager:
            pad = tf.constant([[nhist, 0], [0, 0]])
            spad = tf.pad(em_h2, pad, "CONSTANT")
            gpad = tf.pad(aout, pad, "CONSTANT")

            # 1.0 keeps this a true division under Python 2 as well, where
            # 1 / nhist would truncate to 0.
            self.inr = 1.0 / nhist * tf.stop_gradient(
                bcs(em_h2, spad, gpad, nhist))

            # Append nhist copies of the last row so that row i of spadf[nhist:]
            # is the state nhist rows ahead, clamped to the last state.
            lstate = em_h2[-1, :]
            rep = tf.reshape(tf.tile(lstate, tf.constant([nhist])),
                             (nhist, nout))
            spadf = tf.concat([em_h2, rep], axis=0)
            self.fvec = spadf[nhist:] - em_h2
            self.train_nlp = self.pd.neglogp(
                tf.nn.l2_normalize(tf.stop_gradient(self.fvec), axis=-1))
            # feed_fvec defaults to None, which tf.stop_gradient cannot
            # accept; only build loss_nlp when a tensor was supplied.
            if feed_fvec is not None:
                self.loss_nlp = self.pd.neglogp(
                    tf.nn.l2_normalize(tf.stop_gradient(feed_fvec), axis=-1))
            else:
                self.loss_nlp = None
            self.traj_sim = fcs(self.fvec, aout, nhist)