def _lstm(self,
              obs,
              states,
              masks,
              nlstm,
              ac_space,
              nbatch,
              nsteps,
              reuse=False):
        # obs: nbatch * ob_shape
        # states: num_env * (2xnlstm)
        # masks: nbatch
        # TODO: fix dimensions
        num_env = nbatch // nsteps
        nh, nw, nc = obs.shape[1:]
        ob_shape = [nsteps, nh, nw, nc]
        nact = ac_space.n
        with tf.variable_scope('lstm', reuse=reuse):
            h = feature_net(obs)
            xs = batch_to_seq(h, num_env, nsteps)
            ms = batch_to_seq(masks, num_env, nsteps)
            h5, snew = lstm(xs, ms, states, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.vpred = vf[:, 0]

        self.pdtype = pdtype = make_pdtype(ac_space)
        self.pd = pdtype.pdfromflat(pi)

        self.snew = snew
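A minimal numpy sketch (not from any of the original snippets; the helper names are invented) of the reshaping that batch_to_seq and seq_to_batch are assumed to perform here: a flat batch of nenv*nsteps rows, laid out environment-major, is split into nsteps slices of shape [nenv, ...] for the LSTM and stitched back afterwards.

import numpy as np

def batch_to_seq_np(h, nenv, nsteps):
    # [nenv*nsteps, nh] -> list of nsteps arrays, each [nenv, nh]
    h = h.reshape(nenv, nsteps, -1)
    return [h[:, i, :] for i in range(nsteps)]

def seq_to_batch_np(hs):
    # list of nsteps arrays [nenv, nh] -> [nenv*nsteps, nh]
    nsteps, nenv = len(hs), hs[0].shape[0]
    return np.stack(hs, axis=1).reshape(nenv * nsteps, -1)

h = np.arange(12).reshape(6, 2)                  # nbatch = 6 rows, nh = 2
seq = batch_to_seq_np(h, nenv=2, nsteps=3)       # 3 slices of shape [2, 2]
assert np.array_equal(seq_to_batch_np(seq), h)   # round-trip recovers the batch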
Example #2
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
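A hypothetical usage sketch (policy, env, nenv and nsteps are assumed names, not defined in the snippet above) of how initial_state, step and the done mask are threaded through a rollout: the LSTM state returned by step at time t is fed back at t+1, and the previous dones reset it.

import numpy as np

obs = env.reset()                                # assumed vectorized env with nenv sub-envs
state = policy.initial_state                     # zeros, shape [nenv, 2*nlstm]
dones = np.zeros(nenv, dtype=np.float32)         # mask: 1.0 resets the LSTM state
for _ in range(nsteps):
    actions, values, state, neglogps = policy.step(obs, state, dones)
    obs, rewards, dones, _ = env.step(actions)   # dones feed the mask at the next call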
Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 scope_name="model"):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope(scope_name, reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x: x)
            vf = fc(h5, 'v', 1, act=lambda x: x)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #4
    def network_fn(X, nenv=1):

        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        ob_g, ob_l = tf.split(X, 2, axis=1)
        ob_g = tf.squeeze(ob_g, axis=1) - 128.0
        ob_l = tf.squeeze(ob_l, axis=1) - 128.0

        # Conv layer
        net_g = vggm1234(ob_g)
        net_l = vggm1234(ob_l)
        feat = tf.concat([net_g, net_l], 1)

        # LSTM
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = batch_to_seq(feat, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return (feat, h), {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Example #5
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope="lnlstm", nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope="lstm", nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            "S": S,
            "M": M,
            "state": snew,
            "initial_state": initial_state
        }
Example #6
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_batch,
                 n_steps,
                 n_lstm=256,
                 reuse=False,
                 layer_norm=False,
                 **kwargs):
        super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_batch,
                                         n_steps, n_lstm, reuse)
        with tf.variable_scope("model", reuse=reuse):
            extracted_features = nature_cnn(self.obs_ph, **kwargs)
            input_sequence = batch_to_seq(extracted_features, self.n_env,
                                          n_steps)
            masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm,
                                         layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'v', 1)
            self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(
                rnn_output)

        self._value = value_fn[:, 0]
        self.action = self.proba_distribution.sample()
        self.neglogp = self.proba_distribution.neglogp(self.action)
        self.initial_state = np.zeros((self.n_env, n_lstm * 2),
                                      dtype=np.float32)
        self.value_fn = value_fn
Example #7
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    """
    Calculates q_retrace targets;
    vs: nenv, nsteps (takes obs_{t+1} and g_t as inputs)

    :param R: Rewards
    :param D: Dones
    :param q_i: Q values for actions taken
    :param v: V values
    :param rho_i: Importance weight for each action
    :return: Q_retrace values
    """
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    rs = batch_to_seq(R, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    ds = batch_to_seq(D, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    q_is = batch_to_seq(q_i, nenvs, nsteps, True)
    vs = batch_to_seq(v, nenvs, nsteps, True)   # (by lizn, only the next state value)
    v_final = vs[-1]
    qret = v_final
    qrets = []
    for i in range(nsteps - 1, -1, -1):
        check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
        qret = rs[i] + gamma * qret * (1.0 - ds[i])
        qrets.append(qret)
        if i > 0:
            qret = (rho_bar[i] * (qret - q_is[i])) + vs[i-1]
    qrets = qrets[::-1]
    qret = seq_to_batch(qrets, flat=True)
    return qret
Example #8
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = nature_cnn(X, **conv_kwargs)

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Example #9
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
Example #10
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #11
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_stack,
                 reuse=False,
                 n_lstm=256):
        super(AcerLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env,
                                             n_steps, n_stack, reuse, n_lstm)
        with tf.variable_scope("model", reuse=reuse):
            extracted_features = nature_cnn(self.obs_ph)

            # lstm
            input_seq = batch_to_seq(extracted_features, n_env, n_steps)
            masks = batch_to_seq(self.masks_ph, n_env, n_steps)
            rnn_output, self.snew = lstm(input_seq,
                                         masks,
                                         self.states_ph,
                                         'lstm1',
                                         n_hidden=n_lstm)
            rnn_output = seq_to_batch(rnn_output)

            pi_logits = linear(rnn_output, 'pi', self.n_act, init_scale=0.01)
            policy = tf.nn.softmax(pi_logits)
            q_value = linear(rnn_output, 'q', self.n_act)

        self.action = sample(
            pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((n_env, n_lstm * 2), dtype=np.float32)
        self.policy = policy  # actual policy params now
        self.q_value = q_value
Example #12
    def network_fn(X, nenv=1):
        print("")
        print("IN HERE LSTM and this is X ",str(X))
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
        #T = tf.get_variable(name='init', shape=[1, 2], initializer=tf.constant_initializer(1)) # task descriptor

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)

        ## TODO:  need to change initialization of state!
        initial_state = np.zeros(S.shape.as_list(), dtype=float)


        print("")
        print("HHHHH ",str(S.shape.as_list()))
        print(nenv)

        #initial_state = utils.fc(T,'pi_init', [nenv,48], init_scale=0.01, init_bias=0.01)
        #initial_state = tf.get_variable(name='init_state', shape=initial_state.shape, initializer=tf.zeros_initializer(), trainable=True) # task descriptor

        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
Example #13
    def network_fn(X, nenv=1):
        # TODO(akadian): modify the below code to adapt for depth
        nbatch = X[0].shape[0]
        nsteps = nbatch // nenv

        if X[0].shape[3] == 3:
            h = nature_cnn(X[0], **conv_kwargs)  # rgb
        elif X[0].shape[3] == 1:
            h = depth_cnn(X[0], **conv_kwargs)  # depth
        else:
            raise ValueError

        h = tf.concat([h, X[1]], 1)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
Example #14
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
Example #15
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    """
    Calculates q_retrace targets

    :param R: Rewards
    :param D: Dones
    :param q_i: Q values for actions taken
    :param v: V values
    :param rho_i: Importance weight for each action
    :return: Q_retrace values
    """
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    rs = batch_to_seq(R, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    ds = batch_to_seq(D, nenvs, nsteps, True)  # list of len steps, shape [nenvs]
    q_is = batch_to_seq(q_i, nenvs, nsteps, True)
    vs = batch_to_seq(v, nenvs, nsteps + 1, True)
    v_final = vs[-1]
    qret = v_final
    qrets = []
    for i in range(nsteps - 1, -1, -1):
        check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
        qret = rs[i] + gamma * qret * (1.0 - ds[i])
        qrets.append(qret)
        qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
    qrets = qrets[::-1]
    qret = seq_to_batch(qrets, flat=True)
    return qret
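Written out, the backward loop above computes the Retrace target with truncated importance weights (a sketch in added notation; the symbols are not defined in the source, with T = nsteps):

\begin{align*}
\bar{\rho}_t &= \min(1, \rho_t) \\
Q^{\mathrm{ret}}_{T-1} &= r_{T-1} + \gamma\,(1 - d_{T-1})\,V(s_T) \\
Q^{\mathrm{ret}}_t &= r_t + \gamma\,(1 - d_t)\Big[\bar{\rho}_{t+1}\big(Q^{\mathrm{ret}}_{t+1} - Q(s_{t+1}, a_{t+1})\big) + V(s_{t+1})\Big], \quad t < T-1
\end{align*}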
Example #16
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
        nenv = nbatch // nsteps
        print(f'{nlstm}')
        ob_shape = (nbatch,) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            # h1 = fc(X, 'fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            activ = tf.tanh
            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            xs = batch_to_seq(h1, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h2, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h2 = seq_to_batch(h2)
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                initializer=tf.zeros_initializer())

            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', 1)

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        def get_act(ob, state, mask):
            a = sess.run(a0, {X:ob, S:state, M:mask})
            return a

        def get_mean(ob, state, mask):
            a, state_new = sess.run([pi, snew], {X:ob, S:state, M:mask})
            return a, state_new


        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.act = get_act
        self.mean = get_mean
Example #17
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=reuse)
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        nlstm = self.lstm_units
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            X = tf.cast(X, tf.float32)
            with tf.variable_scope("Towers", reuse=reuse):
                with tf.variable_scope("tower_1"):
                    tower1 = tf.layers.conv2d(inputs=X, filters=64, kernel_size=(3, 3), strides=(1, 1),
                                              padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                    tower1 = tf.layers.conv2d(inputs=tower1, filters=32, kernel_size=(3, 3), strides=(1, 1),
                                              padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                    tower1 = tf.layers.max_pooling2d(tower1, pool_size=(22, 80), strides=(22, 80))

                with tf.variable_scope("tower_2"):
                    tower2 = tf.layers.max_pooling2d(X, pool_size=(2, 2), strides=(2, 2))
                    for _ in range(self.depth):
                        tower2 = tf.layers.conv2d(inputs=tower2, filters=32, kernel_size=(3, 3), strides=(1, 1),
                                                  padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                        tower2 = tf.nn.relu(tower2)
                    tower2 = tf.layers.max_pooling2d(tower2, pool_size=(11, 40), strides=(11, 40))

                with tf.variable_scope("tower_3"):
                    tower3 = tf.layers.max_pooling2d(X, pool_size=(3, 6), strides=(3, 6), padding='SAME')
                    for _ in range(self.depth):
                        tower3 = tf.layers.conv2d(inputs=tower3, filters=32, kernel_size=(3, 3), strides=(1, 1),
                                                  padding='SAME', kernel_initializer=tf.initializers.variance_scaling)
                        tower3 = tf.nn.relu(tower3)
                    tower3 = tf.layers.max_pooling2d(tower3, pool_size=(8, 14), strides=(8, 14), padding='SAME')

                concat = tf.concat([tower1, tower2, tower3], axis=-1)

            # lstm
            xs = batch_to_seq(concat, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        self.a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.snew = snew
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q
        self.sess = sess
Example #18
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=128, reuse=False):
        scope = "model"
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape, name="observations") #obs
        M = tf.placeholder(tf.float32, [nbatch], name="mask") #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2], name="states") #states
        with tf.variable_scope(scope, reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
            self._saver = tf.train.Saver(trainable_vars)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        def save(path, name):
            try:
                os.makedirs(path)
            except FileExistsError:
                pass
            self._saver.save(sess, path+name)

        def load(path, name):
            if os.path.exists(path+name+'.index'):
                self._saver.restore(sess, path+name)
            else:
                tf.logging.warn('Failed restoring vars from %s' % path)

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.save = save
        self.load = load
Example #19
            def network_fn(X, nenv=1):
                nbatch = X.shape[0]
                nsteps = nbatch // nenv

                h = X
                with tf.variable_scope('mlp_in', reuse=tf.AUTO_REUSE):
                    for i in range(num_layers_in):
                        h = fc(h,
                               'mlp_in_fc{}'.format(i),
                               nh=num_hidden_in,
                               init_scale=np.sqrt(2))
                        if layer_norm_in:
                            h = tf.contrib.layers.layer_norm(h,
                                                             center=True,
                                                             scale=True)
                        h = activation(h)

                h = tf.layers.flatten(h)  # flatten the mlp_in output (flattening X here would discard the mlp_in layers above)

                M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
                S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

                xs = batch_to_seq(h, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)

                if layer_norm_lstm:
                    h5, snew = utils.lnlstm(xs,
                                            ms,
                                            S,
                                            scope='lnlstm',
                                            nh=nlstm)
                else:
                    h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

                h = seq_to_batch(h5)

                with tf.variable_scope('mlp_out', reuse=tf.AUTO_REUSE):
                    for i in range(num_layers_out):
                        h = fc(h,
                               'mlp_out_fc{}'.format(i),
                               nh=num_hidden_out,
                               init_scale=np.sqrt(2))
                        if layer_norm_out:
                            h = tf.contrib.layers.layer_norm(h,
                                                             center=True,
                                                             scale=True)
                        h = activation(h)

                initial_state = np.zeros(S.shape.as_list(), dtype=float)

                return h, {
                    'S': S,
                    'M': M,
                    'state': snew,
                    'initial_state': initial_state
                }
Example #20
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))

            # lstm
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact, act=lambda x: x)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step


# For Mujoco. Taken from PPOSGD
Example #21
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 create_additional=True,
                 nlstm=256):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            if (create_additional):
                vf = fc(h5, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        a0 = self.pd.sample()
        if (create_additional):
            neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            if (create_additional):
                a, v, s, neglogp = sess.run([a0, vf, snew, neglogp0], {
                    X: ob,
                    S: state,
                    M: mask
                })
            else:
                a, s = sess.run([a0, snew], {X: ob, S: state, M: mask})
                v = np.zeros_like(a)
                neglogp = np.zeros_like(a)
            return a, v, s, neglogp

        def value(ob, state, mask):
            return sess.run(vf, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        if (create_additional):
            self.vf = vf
            self.value = value
        self.step = step
Example #22
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 size_mem=256,
                 reuse=False):
        nenv = nbatch // nsteps

        # nh, nw, nc = ob_space.shape
        # ob_shape = (nbatch, nh, nw, nc)
        ob_shape = (nbatch, ) + ob_space.shape
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, size_mem * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            h = self.preprocess(X)
            h = fc(h, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = self.memory_fn(xs, ms, S, nh=size_mem)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x: x)
            vf = fc(h5, 'v', 1, act=lambda x: x)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, size_mem * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #23
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=8, reuse=False):
        # assume ob_space, ac_space to be flattened
        # e.g. original action_space (3,2,3) -> new action_space (36)
        nenv = nbatch // nsteps
        print ("envs and steps and batch:", nenv, nsteps, nbatch)
        #nh, nw, nc = ob_space.shape
        #ob_shape = (nbatch, nh, nw, nc)
        ob_shape = (nbatch,) + ob_space.shape
        #nact = ac_space.high.size
        pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.float32, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            #h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            #h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            #h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            #h3 = conv_to_fc(h3)
            #h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            h4 = fc(X, 'fc1', nh=16, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pdparam = fc(h5, 'pi', pdtype.param_shape()[0], act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)
            #logstd = tf.get_variable(name="logstd", shape=[1, nact],
            #    initializer=tf.zeros_initializer())

        #pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = pdtype #make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #24
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 param=None):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #25
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
        nenv = nbatch // nsteps
        ob_shape = add_batch_dimension(ob_space.shape, nbatch)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name="X") #obs
        M = tf.placeholder(tf.float32, [nbatch], name="M") #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2], name="S") #states
        with tf.variable_scope("model", reuse=reuse):
            xs = batch_to_seq(X, nenv, nsteps) # Observation sequences
            ms = batch_to_seq(M, nenv, nsteps) # Done sequences
            h0, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h0 = seq_to_batch(h0)
            h0 = tf.concat([h0,X],1)
            # Policy
            h1 = fc(h0, 'pi_fc1', nh=128, init_scale=np.sqrt(2), act=tf.nn.relu)
            pi = fc(h1, 'pi', nact, act=tf.tanh, init_scale=0.01)
            # Value function
            h1 = fc(h0, 'vf_fc1', nh=128, init_scale=np.sqrt(2), act=tf.nn.relu)
            vf = fc(h1, 'vf', 1, act=lambda x:x)
            # Current policy variance
            logstd = tf.get_variable(name="logstd", shape=[1, nact], initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #26
def strip(var, n_envs, n_steps, flat=False):
    """
    Removes the last step in the batch

    :param var: (TensorFlow Tensor) The input Tensor
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param flat: (bool) If the input Tensor is flat
    :return: (TensorFlow Tensor) the input tensor, without the last step in the batch
    """
    out_vars = batch_to_seq(var, n_envs, n_steps + 1, flat)
    return seq_to_batch(out_vars[:-1], flat)
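A small numpy illustration (assumed environment-major layout, not from the source) of what strip does for flat inputs: each environment contributes n_steps+1 entries and the last one is dropped.

import numpy as np

def strip_np(var, n_envs, n_steps):
    seq = var.reshape(n_envs, n_steps + 1)   # flat batch -> [n_envs, n_steps+1]
    return seq[:, :-1].reshape(-1)           # drop the last step, re-flatten

v = np.array([0, 1, 2, 10, 11, 12])          # 2 envs, 3 entries each
print(strip_np(v, n_envs=2, n_steps=2))      # -> [ 0  1 10 11]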
Example #27
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps) # Comments by Fei: xs is list of nsteps, each is nenv * nh
            ms = batch_to_seq(M, nenv, nsteps) # Comments by Fei: ms is list of nsteps, each is nenv vector
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) # Comment by Fei: h5 is the same dimension as xs, but with value changed by LSTM. snew is new S
            h5 = seq_to_batch(h5) # Comments by Fei: h5 is nbatch * nh again, just like h4
            pi = fc(h5, 'pi', nact, act=lambda x:x) # Comments by Fei: pi is nbatch * nact
            vf = fc(h5, 'v', 1, act=lambda x:x) # Comments by Fei: vf is nbatch * 1

        v0 = vf[:, 0] # Comments by Fei: v0 is nbatch vector, each value is the value function of a state
        a0 = sample(pi) # Comments by Fei: a0 is nbatch vector, each value is the best choice of action, at that state
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #28
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #29
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #30
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)
        for i in range(len(hiddens) - 1):
            h = utils.fc(h,
                         'mlp_fc{}'.format(i),
                         nh=hiddens[i],
                         init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

        nlstm = hiddens[-1]

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = utils.batch_to_seq(h, nenv, nsteps)
        ms = utils.batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = utils.seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Example #31
def q_retrace(rewards, dones, q_i, values, rho_i, n_envs, n_steps, gamma):
    """
    Calculates the target Q-retrace

    :param rewards: ([TensorFlow Tensor]) The rewards
    :param dones: ([TensorFlow Tensor]) The done flags (episode terminations)
    :param q_i: ([TensorFlow Tensor]) The Q values for actions taken
    :param values: ([TensorFlow Tensor]) The output of the value functions
    :param rho_i: ([TensorFlow Tensor]) The importance weight for each action
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) The discount value
    :return: ([TensorFlow Tensor]) the target Q-retrace
    """
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), n_envs, n_steps,
                           True)  # list of len steps, shape [n_envs]
    reward_seq = batch_to_seq(rewards, n_envs, n_steps,
                              True)  # list of len steps, shape [n_envs]
    done_seq = batch_to_seq(dones, n_envs, n_steps,
                            True)  # list of len steps, shape [n_envs]
    q_is = batch_to_seq(q_i, n_envs, n_steps, True)
    value_sequence = batch_to_seq(values, n_envs, n_steps + 1, True)
    final_value = value_sequence[-1]
    qret = final_value
    qrets = []
    for i in range(n_steps - 1, -1, -1):
        check_shape([
            qret, done_seq[i], reward_seq[i], rho_bar[i], q_is[i],
            value_sequence[i]
        ], [[n_envs]] * 6)
        qret = reward_seq[i] + gamma * qret * (1.0 - done_seq[i])
        qrets.append(qret)
        qret = (rho_bar[i] * (qret - q_is[i])) + value_sequence[i]
    qrets = qrets[::-1]
    qret = seq_to_batch(qrets, flat=True)
    return qret
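For a single environment, the same backward recursion can be checked with a plain numpy sketch (the function name and the toy inputs are illustrative, not from the original code):

import numpy as np

def q_retrace_np(rewards, dones, q_i, values, rho_i, gamma):
    # values carries one extra entry, V(s_T), used to bootstrap the recursion
    rho_bar = np.minimum(1.0, rho_i)
    qret = values[-1]
    out = np.empty_like(rewards)
    for t in range(len(rewards) - 1, -1, -1):
        qret = rewards[t] + gamma * qret * (1.0 - dones[t])
        out[t] = qret
        qret = rho_bar[t] * (qret - q_i[t]) + values[t]
    return out

print(q_retrace_np(rewards=np.array([1.0, 0.0, 1.0]),
                   dones=np.array([0.0, 0.0, 1.0]),
                   q_i=np.array([0.5, 0.4, 0.9]),
                   values=np.array([0.6, 0.5, 0.8, 0.7]),
                   rho_i=np.array([0.9, 1.2, 0.7]),
                   gamma=0.99))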
Example #32
def strip(var, nenvs, nsteps, flat=False):
    vars = batch_to_seq(var, nenvs, nsteps, flat)
    return seq_to_batch(vars, flat)
Example #33
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):
        nenv = nbatch // nsteps

        qmdp_param = {}
        # qmdp_param['K'] = 3
        qmdp_param['obs_len'] = ob_space.shape[0] - ac_space.n
        qmdp_param['num_action'] = ac_space.n
        qmdp_param['num_state'] = 32
        qmdp_param['num_obs'] = 17

        input_len = ob_space.shape
        input_shape = (nbatch, ) + input_len
        num_action = qmdp_param["num_action"]
        obs_len = qmdp_param["obs_len"]
        num_state = qmdp_param['num_state']
        num_obs = qmdp_param['num_obs']

        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.float32, input_shape)  #[nbatch,obs+prev action]
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, num_state])  #beliefs

        with tf.variable_scope("model", reuse=reuse):
            xs = batch_to_seq(X, nenv, nsteps)
            #xs is originally [nbatch, input_len]
            #reshape xs to [nenv, nsteps, input_len]
            #split xs along axis=1 into nsteps slices
            #xs becomes [nsteps, nenv, input_len]
            #split each x into obs and prev_action
            obs = [x[:, 0:obs_len] for x in xs]
            acts = [x[:, obs_len:] for x in xs]
            ms = batch_to_seq(M, nenv, nsteps)
            #same as xs
            #ms has shape [nsteps,nenv]

            #build the network variables
            self.planner_net = PlannerNet("planner", qmdp_param)
            self.filter_net = FilterNet("filter", qmdp_param)

            #calculate the updated belief (s_hist, snew) and the action values Q
            s_hist, snew = self.filter_net.beliefupdate(obs, acts, ms, S)
            # s_hist, snew, w_O, Z_o, b_prime_a, b_f = self.filter_net.beliefupdate(obs, acts, ms, S)
            #s_hist: [nstep,nenv,num_state]
            Q = self.planner_net.VI(nbatch)

            # h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            # h5 = seq_to_batch(h5)

            #calculate action and value
            s_hist = seq_to_batch(s_hist)  #[nbatch,num_state]
            q = self.planner_net.policy(Q, s_hist)

            self.pd, self.pi = self.pdtype.pdfromlatent(q)
            vf = fc(q, 'v', 1)  #critic value function

            #pi = fc(h5, 'pi', nact) #actor
            #vf = fc(h5, 'v', 1) #critic value function

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        # self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.initial_state = np.ones(
            (nenv, num_state), dtype=np.float32) / num_state

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })
            # a,b,c,d,w_O_val,Z_o_val,b_prime_a_val,b_f_val = sess.run([a0, v0, snew, neglogp0, w_O, Z_o, b_prime_a, b_f], {X:ob, S:state, M:mask})
            # print("w_O: ",w_O_val)
            # print("Z_o: ",Z_o_val)
            # print("b_prime_a_val: ",b_prime_a_val)
            # print("b_f_val: ",b_prime_a_val)
            # return a,b,c,d

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #34
    def __init__(self, tf_session, ob_space, ac_space, nbatch,
                 reward_redistribution_config, observation_network_config, lstm_network_config, training_config,
                 exploration_config, nsteps, nlstm=64, reuse=False):
        """LSTM policy network, as described in RUDDER paper
        
        Based on baselines.ppo2.policies.py; LSTM layer sees features from its own trainable observation network and
        the features from the reward redistribution observation network;
        
        Parameters
        -------
        tf_session : tensorflow session
            tensorflow session to compute the graph in
        ob_space
            Baselines ob_space object (see ppo2_rudder.py); must provide .shape attribute for (x, y, c) shapes;
        ac_space
            Baselines ac_space object (see ppo2_rudder.py); must provide .n attribute for number of possible actions;
        nbatch : int
            Batchsize
        nsteps : int
            Fixed number of timesteps to process at once
        reward_redistribution_config : dict
            Dictionary containing config for reward redistribution:
            -----
            lambda_eligibility_trace : float
                Eligibility trace value for redistributed reward
            vf_contrib : float
                Weighting of original value function (vf) vs. redistributed reward (rr), s.t.
                :math:`reward = vf \cdot vf\_contrib + rr \cdot (1-vf\_contrib)`
            use_reward_redistribution_quality_threshold : float
                Quality of reward redistribution has to exceed use_reward_redistribution_quality_threshold to be used;
                use_reward_redistribution_quality_threshold range is [0,1]; Quality measure is the squared prediction
                error, as described in RUDDER paper;
            use_reward_redistribution : bool
                Use reward redistribution?
            rr_junksize : int
                Chunk size ('junk size') for reward redistribution; chunks overlap by one half each
            cont_pred_w : float
                Weighting of continuous prediction loss vs. prediction loss of the final return at the last timestep
            intgrd_steps : int
                Stepsize for integrated gradients
            intgrd_batchsize : int
                Integrated gradients is computed batch-wise if intgrd_batchsize > 1
        observation_network_config : dict
            Dictionary containing config for observation network that processes observations and feeds them to LSTM
            network:
            -----
            show_states : bool
                Show frames to network?
            show_statedeltas : bool
                Show frame deltas to network?
            prepoc_states : list of dicts
                Network config to preprocess frames
            prepoc_deltas : list of dicts
                Network config to preprocess frame deltas
            prepoc_observations : list of dicts
                Network config to preprocess features from frame and frame-delta preprocessing networks
        lstm_network_config : dict
            Dictionary containing config for LSTM network:
            -----
            show_actions : bool
                Show taken actions to LSTM?
            reversed : bool
                Process game sequence in reversed order?
            layers : list of dicts
                Network config for LSTM network and optional additional dense layers
            initializations : dict
                Initialization config for LSTM network
            timestep_encoding : dict
                Set "max_value" and "triangle_span" for TeLL.utiltiy.misc_tensorflow.TriangularValueEncoding class
        training_config : dict
            Dictionary containing config for training and update procedure:
            -----
            n_no_rr_updates : int
                Number of updates to perform without training or using reward redistribution network
            n_pretrain_games : int
                Number of games to pretrain the reward redistribution network without using it;
            downscale_lr_policylag : bool
                Downscale learning rate permanently if the policy lag gets too large?
            optimizer : tf.train optimizer
                Optimizer in tf.train, e.g. "AdamOptimizer"
            optimizer_params : dict
                Kwargs for optimizer
            l1 : float
                Weighting for l1 weight regularization
            l2 : float
                Weighting for l2 weight regularization
            clip_gradients : float
                Threshold for clipping gradients (clipping by norm)
        exploration_config : dict
            Dictionary containing config for exploration:
            -----
            sample_actions_from_softmax : bool
                True: Apply softmax to policy network output and use it as probabilities to pick an action
                False: Pick the action with the maximum policy network output
            temporal_safe_exploration : bool
                Use RUDDER safe exploration?
            save_pi_threshold : float
                Threshold value in range [0,1] for safe actions in RUDDER safe exploration
        nlstm : int
            Number of LSTM units (=memory cells)
        reuse : bool
            Reuse tensorflow variables?
        """
        #
        # Shapes
        #
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        seq_ob_shape = (nenv, -1, nh, nw, 1)
        nact = ac_space.n
        
        #
        # Placeholders for inputs
        #
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        
        #
        # Prepare input
        #
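        # The last channel of X is the current frame; the delta frame is its difference to the previous
        # channel. Both are reshaped to (nenv, nsteps, nh, nw, 1) for the observation networks below.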
        single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape), dtype=tf.float32)
        delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape), dtype=tf.float32)
        
        #
        #  Get observation features from RR model
        #
        rr_model = RewardRedistributionModel(reward_redistribution_config=reward_redistribution_config,
                                             observation_network_config=observation_network_config,
                                             lstm_network_config=lstm_network_config, training_config=training_config,
                                             scopename="RR")
        self.rr_observation_model = rr_model
        rr_observation_layer = rr_model.get_visual_features(single_frame=single_frames, delta_frame=delta_frames,
                                                            additional_inputs=[])
        
        #
        #  Build policy network
        #
        with tf.variable_scope("model", reuse=reuse):
            temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32), trainable=False,
                                          name='temperature')
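            # Non-trainable softmax temperature, used for temperature-scaled action sampling below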
            
            # Feed the RR observation features to the policy through a stop-gradient, so policy
            # updates do not backpropagate into the reward redistribution network
            additional_inputs = [StopGradientLayer(rr_observation_layer)]
            observation_layers, observation_features = observation_network(
                    single_frame=single_frames, delta_frame=delta_frames, additional_inputs=additional_inputs,
                    observation_network_config=observation_network_config)
            
            self.observation_features_shape = observation_features.get_output_shape()
            
            xs = [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps,
                                                       value=tf.reshape(observation_layers[-1].get_output(),
                                                                        [nenv, nsteps, -1]))]
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            h6 = h5
            pi = fc(h6, 'pi', nact)
            vf = fc(h6, 'v', 1)
        
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)
        
        if exploration_config['sample_actions_from_softmax']:
            a0 = self.pd.sample_temp(temperature=temperature)
        else:
            a0 = tf.argmax(pi, axis=-1)
        
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        
        def step(ob, state, mask):
            a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, v, s, neglogp
        
        def value(ob, state, mask):
            return tf_session.run(v0, {X:ob, S:state, M:mask})
        
        def action(ob, state, mask, *_args, **_kwargs):
            a, s, neglogp = tf_session.run([a0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, s, neglogp
        
        #
        # Placeholders for exploration
        #
        n_envs = pi.shape.as_list()[0]
        exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
        prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        
        #
        # Setting up safe exploration
        #
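        # Explore only while exploration_timesteps <= current game length <= exploration_timesteps +
        # exploration_durations, and only if an exploration timestep is actually scheduled (i.e. not -1)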
        explore = tf.logical_and(tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                                                tf.less_equal(gamelengths_pl,
                                                              exploration_timesteps_pl + exploration_durations_pl)),
                                 tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

        safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
        safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
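        # Per-environment safety thresholds: interpolate linearly from 1 (env 0 keeps only the top-scoring
        # action) down to save_pi_threshold for the last environment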
        save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                                   / (n_envs + (n_envs == 1) - 1)) * (1 - exploration_config['save_pi_threshold']))
        safe_pi = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
        safe_pi /= tf.reduce_sum(safe_pi)
        
        rand_safe_a = tf.multinomial(safe_pi, 1)[:, 0]
        
        safe_pi_flat = tf.reshape(safe_pi, (-1,))
        prev_action_is_safe = tf.gather(safe_pi_flat,
                                        prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
                                        * safe_pi.shape.as_list()[1])
        prev_action_is_safe = tf.greater(prev_action_is_safe, tf.constant(0, dtype=tf.float32))
        
        a_explore = tf.where(tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                                           tf.not_equal(gamelengths_pl, exploration_timesteps_pl)),
                                            prev_action_is_safe),
                             prev_actions_pl, rand_safe_a)
        
        a_explore = tf.where(explore, a_explore, a0)
        
        # Make sure the actor doesn't repeat an action too often (otherwise screensaver might start)
        rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n, dtype=a0.dtype)
        a_explore = tf.where(tf.greater(prev_action_count_pl, tf.constant(20, dtype=tf.int64)), rand_a, a_explore)
        
        if not exploration_config['temporal_safe_exploration']:
            a_explore = a0
            
        neglogp_explore = self.pd.neglogp(a_explore)
        
        def action_exploration(ob, state, mask, *_args, exploration_timesteps, prev_actions, gamelengths,
                               keep_prev_action, prev_action_count, exploration_durations, **_kwargs):
            """Get actions with exploration for long-term reward"""
            a, s, neglogp = tf_session.run([a_explore, snew, neglogp_explore],
                                  {X: ob, S:state, M:mask, exploration_timesteps_pl: exploration_timesteps,
                                   prev_actions_pl: prev_actions,
                                   gamelengths_pl: gamelengths, exploration_durations_pl: exploration_durations,
                                   keep_prev_action_pl: keep_prev_action, prev_action_count_pl: prev_action_count})
            return a, s, neglogp
        
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.action = action
        self.action_exploration = action_exploration
        self.seq_ob_shape = seq_ob_shape
        self.exploration_config = exploration_config
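The constructor above expects five nested config dicts whose keys are documented in its docstring. A minimal illustrative sketch of how they might be populated follows; all values are placeholders (not the authors' defaults), and the layer/initialization sub-configs are left empty:

# Hypothetical config sketch for the RUDDER LSTM policy above; values are placeholders only.
reward_redistribution_config = dict(
    vf_contrib=0.5,                                   # blend of value function vs. redistributed reward
    use_reward_redistribution_quality_threshold=0.8,  # quality required before redistribution is used
    use_reward_redistribution=True,
    rr_junksize=100,                                  # chunk length; chunks overlap by one half
    cont_pred_w=0.5,                                  # weight of the continuous prediction loss
    intgrd_steps=50,                                  # integrated gradients steps
    intgrd_batchsize=10,
)
observation_network_config = dict(
    show_states=True,
    show_statedeltas=True,
    prepoc_states=[],                                 # frame preprocessing network configs (omitted)
    prepoc_deltas=[],
    prepoc_observations=[],
)
lstm_network_config = dict(
    show_actions=True,
    reversed=True,
    layers=[],                                        # LSTM / dense layer configs (omitted)
    initializations={},
    timestep_encoding=dict(max_value=1000, triangle_span=100),
)
training_config = dict(
    n_no_rr_updates=0,
    n_pretrain_games=100,
    downscale_lr_policylag=True,
    optimizer="AdamOptimizer",                        # name of (or handle to) a tf.train optimizer
    optimizer_params=dict(learning_rate=1e-4),
    l1=0.0,
    l2=1e-7,
    clip_gradients=0.5,
)
exploration_config = dict(
    sample_actions_from_softmax=True,
    temporal_safe_exploration=True,
    save_pi_threshold=0.01,
)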
Exemple #35
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n

        X = tf.placeholder(tf.float32, ob_shape)  #obs
        I = tf.placeholder(tf.int32, [nbatch, 5])
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states

        # Model
        with tf.variable_scope("model", reuse=reuse):
            # Image Processing
            with tf.variable_scope("cnn"):
                x_image_rep = nature_cnn(X)

            # Instruction Processing
            with tf.variable_scope("GRU"):
                embedding = tf.get_variable(
                    'word_embedding',
                    shape=[12, 32],
                    initializer=tf.random_uniform_initializer(-1e-3, 1e-3))
                gru_cell = tf.contrib.rnn.GRUCell(
                    num_units=256,
                    kernel_initializer=tf.random_uniform_initializer(
                        -1e-3, 1e-3),
                    bias_initializer=tf.random_uniform_initializer(
                        -1e-3, 1e-3))

                encoder_hidden = gru_cell.zero_state(nbatch, dtype=tf.float32)
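                # Unroll the GRU over the 5 instruction tokens; the final hidden state is used as the
                # instruction representation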
                for i in range(5):
                    word_embedding = tf.nn.embedding_lookup(embedding, I[:, i])
                    output, encoder_hidden = gru_cell.call(
                        word_embedding, encoder_hidden)
                x_insts_rep = encoder_hidden

            # Gated-Attention layers
            with tf.variable_scope("x-attn"):
                x_attention = tf.sigmoid(
                    fc(x_insts_rep, 'x-attn', 64, init_scale=1.0))
                x_attention = tf.expand_dims(x_attention, 1)
                x_attention = tf.expand_dims(x_attention, 2)

            with tf.variable_scope("Gated-Attention"):
                x = x_image_rep * x_attention
                x = conv_to_fc(x)
                x = tf.nn.relu(fc(x, 'x-Ga', 256, init_scale=1.0))

            xs = batch_to_seq(x, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h20, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm, init_scale=1.0)
            h20 = seq_to_batch(h20)

            with tf.variable_scope("pi"):
                pi = tf.layers.dense(
                    h20,
                    nact,
                    kernel_initializer=normalized_columns_initializer(0.01))
            with tf.variable_scope("vf"):
                vf = tf.layers.dense(
                    h20,
                    1,
                    kernel_initializer=normalized_columns_initializer(0.01))

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, insts, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                I: insts,
                S: state,
                M: mask
            })

        def value(ob, insts, state, mask):
            return sess.run(v0, {X: ob, I: insts, S: state, M: mask})

        self.X = X
        self.I = I
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

        # start logging
        # =============
        if reuse:
            self.var_summary('./Asset/logdir', sess)
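The model above follows a gated-attention design for instruction-conditioned control: a sigmoid gate derived from the instruction encoding multiplies the visual features channel-wise before the LSTM. A minimal NumPy sketch of just that gating step (shapes and weights are illustrative, not taken from the source):

# Gated-attention step in isolation (illustrative NumPy stand-in for the TF code above).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

batch, H, W, C = 4, 7, 7, 64
image_features = np.random.randn(batch, H, W, C)      # convolutional feature map of the observation
instruction_enc = np.random.randn(batch, 256)         # GRU encoding of the instruction
W_attn = np.random.randn(256, C) * 0.01               # stand-in for the 'x-attn' fc layer weights

attention = sigmoid(instruction_enc @ W_attn)         # (batch, C): one gate per feature channel
gated = image_features * attention[:, None, None, :]  # broadcast the gate over H and W
print(gated.shape)                                    # (4, 7, 7, 64)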
Exemple #36
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet,
                 reuse=False,
                 flownet=None, train_from_scratch=False,
                 recurrent=None,
                 large_cnn=False, nlstm=64, add_predicted_flow_to_vec=False, diff_frames=False):
        ob_shape_vec = (nbatch,) + ob_space["vector"].shape
        nh, nw, nc = ob_space["image"].shape
        ob_shape_im = (nbatch, nh, nw, nc)

        actdim = ac_space.shape[0]
        X_vec = tf.placeholder(tf.float32, ob_shape_vec, name='Ob_vec')  # obs
        X_im = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_im')

        if add_flownet:
            # adding previous image placeholder:
            X_p = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_p')  # obs t-1
        else:
            X_p = None

        if recurrent:
            nenv = nbatch // nsteps
            M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
            S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h_im = mujoco_cnn(
                X_im, 'pi', nbatch, add_flownet and not add_predicted_flow_to_vec,
                X_p, flownet,
                train_from_scratch,
                large_cnn, diff_frames)

            if add_predicted_flow_to_vec:
                flow_vec = get_flow_vec(
                    X_im, 'pi', nbatch, add_flownet,
                    X_p, flownet,
                    train_from_scratch,
                    large_cnn, diff_frames)
                h_vec = tf.concat([X_vec, flow_vec], axis=-1)
                h_vec = activ(fc(h_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            else:
                h_vec = activ(fc(X_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h1 = tf.concat([h_im, h_vec], 1)

            if recurrent:
                xs = batch_to_seq(h1, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)
                if recurrent == 'lstm':
                    h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
                else:
                    assert recurrent == 'lnlstm'
                    h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
                h2 = seq_to_batch(h5)
            else:
                h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)

            vf = fc(h2, 'vf', 1)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        a0_r = self.pd.mode()
        neglogp0 = self.pd.neglogp(a0)
        if not recurrent:
            self.initial_state = None
        else:
            self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)


        self.placeholder_dict = {
            "image": X_im,
            "vector": X_vec
        }
        if add_flownet:
            self.placeholder_dict["last_image"] = X_p

        if not recurrent:
            def step(ob, *_args, remove_noise=False, **_kwargs):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                if not remove_noise:
                    a, v, neglogp = sess.run([a0, v0, neglogp0], feed_dict=feed_dict)
                else:
                    a, v, neglogp = sess.run([a0_r, v0, neglogp0], feed_dict=feed_dict)
                return a, v, self.initial_state, neglogp

            def value(ob, *_args, **_kwargs):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                return sess.run(v0, feed_dict=feed_dict)
        else:
            def step(ob, state, mask, remove_noise=False):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                feed_dict[S] = state
                feed_dict[M] = mask
                if not remove_noise:
                    a, v, s, neglogp = sess.run([a0, v0, snew, neglogp0], feed_dict=feed_dict)
                else:
                    a, v, s, neglogp = sess.run([a0_r, v0, snew, neglogp0], feed_dict=feed_dict)
                return a, v, s, neglogp

            def value(ob, state, mask):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                feed_dict[S] = state
                feed_dict[M] = mask
                return sess.run(v0, feed_dict=feed_dict)

        self.X_im = X_im
        self.X_vec = X_vec
        self.X_p = X_p
        self.pi = pi
        if not recurrent:
            self.vf = v0
        else:
            self.vf = vf
            self.M = M
            self.S = S
        self.step = step
        self.value = value
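This policy has continuous actions: the distribution parameters are the action means ('pi') concatenated with a state-independent log standard deviation broadcast over the batch, i.e. a diagonal Gaussian. A small self-contained NumPy sketch of sampling and the negative log-likelihood under that parameterization (illustrative values, in the style of baselines' DiagGaussianPd):

# Diagonal Gaussian sampling / neglogp for a concatenated [mean, logstd] parameterization (sketch).
import numpy as np

batch, actdim = 4, 6
mean = np.random.randn(batch, actdim) * 0.01            # corresponds to 'pi' above
logstd = np.zeros((1, actdim))                           # state-independent, like the 'logstd' variable
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)

mu, log_sigma = np.split(pdparam, 2, axis=1)
sigma = np.exp(log_sigma)
actions = mu + sigma * np.random.randn(*mu.shape)        # a0 = pd.sample()
neglogp = (0.5 * np.sum(((actions - mu) / sigma) ** 2, axis=1)
           + 0.5 * np.log(2.0 * np.pi) * actdim
           + np.sum(log_sigma, axis=1))                  # pd.neglogp(a0)
print(actions.shape, neglogp.shape)                      # (4, 6) (4,)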
Exemple #37
def strip(var, nenvs, nsteps, flat = False):
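    # Split the nenvs * (nsteps + 1) batch into per-timestep chunks, drop the last timestep of every
    # environment, and flatten back to a batch of nenvs * nsteps rows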
    vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
    return seq_to_batch(vars[:-1], flat)
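A NumPy equivalent of strip(), assuming the baselines layout where input rows are ordered environment-major (all nsteps + 1 timesteps of env 0 first, then env 1, and so on); shapes are illustrative:

# NumPy stand-in for strip(): drop the last timestep per environment and re-flatten.
import numpy as np

nenvs, nsteps, nfeat = 2, 3, 5
var = np.arange(nenvs * (nsteps + 1) * nfeat, dtype=np.float32).reshape(nenvs * (nsteps + 1), nfeat)

seq = var.reshape(nenvs, nsteps + 1, nfeat)                # same grouping as batch_to_seq
stripped = seq[:, :-1, :].reshape(nenvs * nsteps, nfeat)   # drop the last timestep, like vars[:-1]
print(stripped.shape)                                      # (6, 5)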