Example #1
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=reuse):
            h = ppo_cnn_model(processed_x)
            v = tf.layers.dense(h, 1, name='v')
            vf = tf.squeeze(v, axis=[1])
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
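All of the policies in these examples expose the same step()/value() interface; the sketch below shows how a rollout loop typically drives it. This is an illustrative sketch: `policy`, `env` (a vectorized environment), and `n_steps` are assumed names, not taken from the source.

import numpy as np

obs = env.reset()
for _ in range(n_steps):
    # step() returns: sampled actions, value estimates, the recurrent state
    # (None for feed-forward policies), and -log pi(a|s) for each action.
    actions, values, state, neglogps = policy.step(obs)
    obs, rewards, dones, _ = env.step(actions)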
Example #2
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_batch,
                 n_steps,
                 n_lstm=256,
                 reuse=False):
        """
        Policy object for A2C

        :param sess: (TensorFlow session) The current TensorFlow session
        :param ob_space: (Gym Space) The observation space of the environment
        :param ac_space: (Gym Space) The action space of the environment
        :param n_batch: (int) The batch size to run (n_envs * n_steps)
        :param n_steps: (int) The number of steps to run for each environment
        :param n_lstm: (int) The number of LSTM cells (for recurrent policies)
        :param reuse: (bool) Whether to reuse the policy's TensorFlow variables
        """
        self.n_env = n_batch // n_steps
        self.obs_ph, self.processed_x = observation_input(ob_space, n_batch)
        self.masks_ph = tf.placeholder(tf.float32,
                                       [n_batch])  # mask (done t-1)
        self.states_ph = tf.placeholder(tf.float32,
                                        [self.n_env, n_lstm * 2])  # states
        self.pdtype = make_proba_dist_type(ac_space)
        self.sess = sess
        self.reuse = reuse
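A minimal construction sketch for the base policy above, assuming a concrete subclass (here called `A2CBasePolicy`, an illustrative name) and a Gym environment; it mainly illustrates the n_batch = n_envs * n_steps relationship from the docstring.

import gym
import tensorflow as tf

env = gym.make('CartPole-v1')
n_envs, n_steps = 4, 5

sess = tf.Session()
policy = A2CBasePolicy(sess,
                       ob_space=env.observation_space,
                       ac_space=env.action_space,
                       n_batch=n_envs * n_steps,
                       n_steps=n_steps)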
Example #3
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            flatten = tf.layers.flatten
            pi_h1 = activ(
                fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(
                fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #5
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, name='policy', **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope(name, reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        # Build the greedy-action op once so step_test does not add a new
        # argmax node to the graph on every call.
        a_greedy = tf.argmax(self.pd.logits, axis=-1)

        def step_test(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a_greedy, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def step_policyflat(ob, *_args, **_kwargs):
            a, v, neglogp, policyflat = sess.run([a0, vf, neglogp0, self.pd.logits], {X: ob})
            return a, v, self.initial_state, neglogp, policyflat

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.step_test = step_test
        self.step_policyflat = step_policyflat
        self.value = value
Example #6
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))  # policy
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))  # value function
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            # pdtype: the parameterized family of probability distributions
            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        # Create the parameterized distribution matching the action space. For
        # example, if the action space is Discrete(4), the distribution type is
        # CategoricalPdType(). Combining that type with the network output (pi)
        # gives the action distribution CategoricalPd; sampling from it yields
        # the action a0, and neglogp0 is that action's self-information
        # (negative log-probability).
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
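A worked sketch of the distribution machinery described in the comment above, assuming baselines' `make_pdtype` (the import path is the usual one in that codebase; the latent tensor is a stand-in for pi_h2):

import tensorflow as tf
from gym import spaces
from baselines.common.distributions import make_pdtype

pdtype = make_pdtype(spaces.Discrete(4))  # -> CategoricalPdType
latent = tf.zeros([1, 64])                # stand-in for pi_h2
pd, pi = pdtype.pdfromlatent(latent)      # pd: CategoricalPd, pi: flat logits
a0 = pd.sample()                          # sampled action tensor
neglogp0 = pd.neglogp(a0)                 # self-information of that action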
Example #7
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            vf = fc(h, 'v', 1)[:, 0]
            lp = fc(h, 'lp', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, l, neglogp = sess.run([a0, vf, lp, neglogp0], {X: ob})
            return a, v, l, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.lp = lp
        self.step = step
        self.value = value
Example #8
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 arch='impala',
                 use_batch_norm=True,
                 dropout=0,
                 **conv_kwargs):
        self.pdtype = make_pdtype(ac_space)

        X, processed_x = observation_input(ob_space, nbatch)
        scaled_images = tf.cast(processed_x, tf.float32) / 255.

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = random_impala_cnn(
                scaled_images, use_batch_norm=use_batch_norm, dropout=dropout)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            clean_h, _ = impala_cnn(scaled_images,
                                    use_batch_norm=use_batch_norm,
                                    dropout=dropout)
            clean_vf = fc(clean_h, 'v', 1)[:, 0]
            self.clean_pd, self.clean_pi = self.pdtype.pdfromlatent(
                clean_h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)

        clean_a0 = self.clean_pd.sample()
        clean_neglogp0 = self.clean_pd.neglogp(clean_a0)

        self.initial_state = None

        def step(ob, clean_flag, *_args, **_kwargs):
            a, v, neglogp, c_a, c_v, c_neglogp \
            = sess.run([a0, vf, neglogp0, clean_a0, clean_vf, clean_neglogp0], {X:ob})
            if clean_flag:
                return c_a, c_v, self.initial_state, c_neglogp
            else:
                return a, v, self.initial_state, neglogp

        def value(ob, clean_flag, *_args, **_kwargs):
            v, c_v = sess.run([vf, clean_vf], {X: ob})
            if clean_flag:
                return c_v
            else:
                return v

        self.X = X
        self.H = h
        self.CH = clean_h
        self.vf = vf
        self.clean_vf = clean_vf

        self.step = step
        self.value = value
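The two `variable_scope("model", reuse=tf.AUTO_REUSE)` blocks above share weights wherever layer names coincide (e.g. the 'v' value head), because AUTO_REUSE creates a variable on first use and reuses it afterwards. A self-contained sketch of that TF1 behavior:

import tensorflow as tf

with tf.variable_scope('demo', reuse=tf.AUTO_REUSE):
    w1 = tf.get_variable('w', shape=[3])
with tf.variable_scope('demo', reuse=tf.AUTO_REUSE):
    w2 = tf.get_variable('w', shape=[3])
assert w1 is w2  # the second block reuses the first block's variable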
Example #9
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)


        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #10
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
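Stepping this recurrent policy requires threading the LSTM state and the done mask through successive calls; a sketch, with an assumed vectorized `env` of `nenv` copies (names are illustrative):

import numpy as np

state = policy.initial_state              # zeros, shape (nenv, 2 * nlstm)
dones = np.zeros(nenv, dtype=np.float32)  # M placeholder: 1.0 resets the state
obs = env.reset()
for _ in range(nsteps):
    actions, values, state, neglogps = policy.step(obs, state, dones)
    obs, rewards, dones, _ = env.step(actions)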
Example #11
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        # The batch dimension is left as None so step() accepts any batch size.
        X, processed_x = observation_input(ob_space, None)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None
        
        self.entropy = cat_entropy(self.pi)

        def step(ob, *_args, **_kwargs):
            a, neglogp = sess.run([a0, neglogp0], {X:ob})
            return a, self.initial_state, neglogp

        # This policy has no value head, so no value() function is defined.

        def neg_log_prob(actions):
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=actions)

        self.X = X
        self.step = step
        self.neg_log_prob = neg_log_prob
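For a categorical policy, the negative log-probability of an action equals the sparse softmax cross-entropy between the logits and the action index, which is exactly what neg_log_prob() above computes. A standalone check with illustrative values:

import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 0.5],
                      [0.2, 0.1, 3.0]])
actions = tf.constant([1, 2], dtype=tf.int64)
nlp = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                     labels=actions)
# nlp[i] == -log softmax(logits[i])[actions[i]]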
Example #12
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #13
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)

        X, processed_x = observation_input(ob_space, nbatch)
        scaled_images = tf.cast(processed_x, tf.float32) / 255.
        mc_index = tf.placeholder(tf.int64, shape=[1], name='mc_index')  # unused below

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = random_impala_cnn(scaled_images)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            clean_h, _ = impala_cnn(scaled_images)
            clean_vf = fc(clean_h, 'v', 1)[:, 0]
            self.clean_pd, self.clean_pi = self.pdtype.pdfromlatent(
                clean_h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)

        clean_a0 = self.clean_pd.sample()
        clean_neglogp0 = self.clean_pd.neglogp(clean_a0)

        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def step_with_clean(flag, ob, *_args, **_kwargs):
            a, v, neglogp, c_a, c_v, c_neglogp \
            = sess.run([a0, vf, neglogp0, clean_a0, clean_vf, clean_neglogp0], {X:ob})
            if flag:
                return c_a, c_v, self.initial_state, c_neglogp
            else:
                return a, v, self.initial_state, neglogp

        def value_with_clean(flag, ob, *_args, **_kwargs):
            v, c_v = sess.run([vf, clean_vf], {X: ob})
            if flag:
                return c_v
            else:
                return v

        self.X = X
        self.H = h
        self.CH = clean_h
        self.vf = vf
        self.clean_vf = clean_vf

        self.step = step
        self.value = value
        self.step_with_clean = step_with_clean
        self.value_with_clean = value_with_clean
Example #14
    def __init__(self, observation_space, name=None):
        """
        Creates an input placeholder tailored to a specific observation space

        :param observation_space: (Gym Space) observation space of the environment. Should be one of the gym.spaces
            types
        :param name: (str) tensorflow name of the underlying placeholder
        """
        inpt, self.processed_inpt = observation_input(observation_space, name=name)
        super().__init__(inpt)
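A usage sketch, assuming this is the baselines-style ObservationInput wrapper whose get() returns the processed tensor (the environment name is illustrative):

import gym

env = gym.make('PongNoFrameskip-v4')
obs_input = ObservationInput(env.observation_space, name='observation')
obs_tensor = obs_input.get()  # processed (e.g. float-cast) observation tensor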
Example #15
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        img_X, processed_img_x = observation_input(ob_space[0], nbatch)
        vec_X, processed_vec_x = observation_input(ob_space[1], nbatch)
        with tf.variable_scope("model", reuse=reuse):
            # img feature extractor
            img_h = vgg19_cnn(processed_img_x, **conv_kwargs)

            # vec feature extractor
            activ = tf.nn.relu

            vec_h1 = activ(fc(processed_vec_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            vec_h = activ(fc(vec_h1, 'pi_fc2', nh=128, init_scale=np.sqrt(2)))

            # feature concat
            h = tf.concat([img_h,vec_h],1)

            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            img_x_input = ob[0]
            vec_x_input = ob[1]

            a, v, neglogp = sess.run([a0, vf, neglogp0], {img_X:img_x_input,vec_X:vec_x_input})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            img_x_input = ob[0]
            vec_x_input = ob[1]

            return sess.run(vf, {img_X:img_x_input,vec_X:vec_x_input})

        self.img_X = img_X
        self.vec_X = vec_X
        self.vf = vf
        self.step = step
        self.value = value
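A usage sketch for this two-input policy: observations arrive as an (image, vector) pair matching ob_space[0] and ob_space[1]. Shapes and names below are illustrative.

import numpy as np

img_batch = np.zeros((nbatch,) + ob_space[0].shape, dtype=np.uint8)
vec_batch = np.zeros((nbatch,) + ob_space[1].shape, dtype=np.float32)
actions, values, _, neglogps = policy.step((img_batch, vec_batch))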
Example #16
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 create_additional=True,
                 nlstm=256):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            if create_additional:
                vf = fc(h5, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        a0 = self.pd.sample()
        if create_additional:
            neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            if create_additional:
                a, v, s, neglogp = sess.run([a0, vf, snew, neglogp0], {
                    X: ob,
                    S: state,
                    M: mask
                })
            else:
                a, s = sess.run([a0, snew], {X: ob, S: state, M: mask})
                v = np.zeros_like(a)
                neglogp = np.zeros_like(a)
            return a, v, s, neglogp

        def value(ob, state, mask):
            return sess.run(vf, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        if create_additional:
            self.vf = vf
            self.value = value
        self.step = step
Example #17
    def __init__(self, observation_space, name=None, extra_channels=0):
        """Creates an input placeholder tailored to a specific observation space

        Parameters
        ----------

        observation_space:
                observation space of the environment. Should be one of the gym.spaces types
        name: str
                tensorflow name of the underlying placeholder
        extra_channels: int
                passed through to observation_input (extra observation channels)
        """
        inpt, self.processed_inpt = observation_input(observation_space, name=name, extra_channels=extra_channels)
        super().__init__(inpt)
Example #18
    def __init__(self, observation_space, name=None):
        """Creates an input placeholder tailored to a specific observation space

        Parameters
        ----------

        observation_space:
                observation space of the environment. Should be one of the gym.spaces types
        name: str
                tensorflow name of the underlying placeholder
        """
        inpt, self.processed_inpt = observation_input(observation_space, name=name)
        super().__init__(inpt)
Example #19
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=tf.AUTO_REUSE,
                 policy_scope='',
                 value_scope=''):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model" + policy_scope, reuse=tf.AUTO_REUSE):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(
                fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            self.pd, self.pi = self.pdtype.pdfromlatent(
                pi_h2,
                init_scale=0.01)  # pd->probability distribution; pi->policy
        with tf.variable_scope("model" + value_scope, reuse=tf.AUTO_REUSE):
            vf_h1 = activ(
                fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def value_pi(ob, *_args, **_kwargs):
            pass

        def neg_log_prob(actions):
            return self.pd.neglogp(actions)

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.neg_log_prob = neg_log_prob
        self.entropy = self.pd.entropy()
Example #20
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 create_additional=True,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            if create_additional:
                vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        if create_additional:
            neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            if create_additional:
                a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            else:
                a = sess.run(a0, {X: ob})
                v = np.zeros_like(a)
                neglogp = np.zeros_like(a)
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        if create_additional:
            self.vf = vf
            self.value = value
        self.step = step
Example #21
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 arch='impala',
                 use_batch_norm=True,
                 dropout=0,
                 **conv_kwargs):
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            processed_x3 = processed_x

            h, self.dropout_assign_ops = choose_cnn(
                processed_x3,
                arch=arch,
                use_batch_norm=use_batch_norm,
                dropout=dropout)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, clean_flag, *_args, **_kwargs):
            # clean_flag is accepted for interface compatibility but unused here.
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, clean_flag, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #22
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #23
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            with tf.variable_scope("policy", reuse=tf.AUTO_REUSE):
                vf = fc(h, 'v', 1)[:, 0]
                self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

            # Dipam: Add discriminator network on h
            with tf.variable_scope("discriminator", reuse=tf.AUTO_REUSE):
                discfc1 = tf.nn.tanh(fc(h, 'discL1', 100))
                disc_logits = fc(discfc1, 'disc', 2)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.disc_logits = disc_logits
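The two-way disc_logits head above is not trained in this snippet; a discriminator of this shape is typically trained with a cross-entropy loss against real/fake labels. A hedged sketch (the labels placeholder and label convention are assumptions, not from the source):

labels = tf.placeholder(tf.int64, [None])  # e.g. 1 = expert data, 0 = policy data
disc_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=disc_logits,
                                                   labels=labels))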
Example #24
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=reuse):
            conv = Conv2D(64, (3, 3), padding='same')(processed_x)
            conv = Conv2D(32, (3, 3), padding='same')(conv)
            flat = Flatten()(conv)
            dense = Dense(100, activation='relu')(flat)
            vf = Dense(1)(dense)
            self.pd, self.pi = self.pdtype.pdfromlatent(dense)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v[:, 0], self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})[:, 0]

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #25
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch])  # mask
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope('model', reuse=reuse):
            h = caps_cnn(processed_x)
            h = capsule_conv(h, 'capsconv', 4, 2, 32, 8)
            h = capsule(h, 'caps', 16, 8, from_conv=True)
            vf = fc(h, 'v', 1)[:, 0]  # value function
            # For discrete action spaces, create a final capsule layer with
            # one capsule for each possible action.
            if isinstance(ac_space, spaces.Discrete):
                p = capsule(h, 'pcaps', ac_space.n, 4, from_conv=False)
                pnorm = tf.reduce_sum(tf.square(p), axis=2)
                self.pd, self.pi = self.pdtype.pdfromflat(pnorm), pnorm
            else:
                self.pd, self.pi = self.pdtype.pdfromlatent(h)
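Here pnorm is the squared length of each action capsule, and pdfromflat treats it as the flat (logit-like) parameters of the categorical distribution, so a longer capsule means a more probable action. A shape sketch with illustrative values, not from the source:

p = tf.zeros([nbatch, ac_space.n, 4])        # [batch, n_actions, capsule_dim]
pnorm = tf.reduce_sum(tf.square(p), axis=2)  # [batch, n_actions] flat params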
Example #26
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)

        # explicitly create a vector space for latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256, ))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.placeholder(shape=(nbatch, ) + ob_space.shape,
                               dtype=np.float32,
                               name='Ob')
            processed_x = X
            # create placeholders for custom loss
            ANCHORS = tf.placeholder(shape=(nbatch, ) + ob_space.shape,
                                     dtype=np.float32,
                                     name='anch')
            POST_TRAJ = tf.placeholder(shape=(nbatch, ) + ob_space.shape,
                                       dtype=np.float32,
                                       name='post_traj')
            NEG_TRAJ = tf.placeholder(shape=(nbatch, ) + ob_space.shape,
                                      dtype=np.float32,
                                      name='neg_traj')
        else:
            X, processed_x = observation_input(ob_space, nbatch)
            ANCHORS, PROC_ANCH = observation_input(ob_space,
                                                   Config.REP_LOSS_M *
                                                   Config.NUM_ENVS,
                                                   name='anch')
            POST_TRAJ, PROC_POS = observation_input(ob_space,
                                                    Config.REP_LOSS_M *
                                                    Config.NUM_ENVS,
                                                    name='pos_traj')
            NEG_TRAJ, PROC_NEG = observation_input(
                ob_space,
                Config.REP_LOSS_M * Config.NUM_ENVS * Config.NEGS,
                name='neg_traj')
            AVG_REPS, AVG_REP_PROC = observation_input(latent_space, nbatch)
            # observation input
        with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                processed_x)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops
            # stack together action invariant & conditioned layers for full representation layer
            self.h = tf.concat([act_condit, act_invariant], axis=1)
            # concat average phi vector
            self.h_avg = tf.concat([self.h, AVG_REP_PROC], axis=1)
            self.h_vf = self.h_avg

            # NOTE: (Ahmed) I commented out all the IBAC-SNI settings to make this easier to read
            # since we shouldn't be using any of these settings anyway.

            # Noisy policy and value function for train
            # if Config.BETA >= 0:
            #     pdparam = _matching_fc(self.h, 'pi', ac_space.n, init_scale=1.0, init_bias=0)
            #     pdparam = tf.reshape(pdparam, shape=(Config.NR_SAMPLES, -1, ac_space.n))
            #     pdparam = tf.transpose(pdparam, perm=[1,0,2])

            #     dists = ds.Categorical(logits=pdparam)
            #     self.pd_train = ds.MixtureSameFamily(
            #         mixture_distribution=ds.Categorical(probs=[1./Config.NR_SAMPLES]*Config.NR_SAMPLES),
            #         components_distribution=dists)
            #     self.pd_train.neglogp = lambda a: - self.pd_train.log_prob(a)
            #     self.vf_train = tf.reduce_mean(tf.reshape(fc(self.h, 'v', 1), shape=(Config.NR_SAMPLES, -1, 1)), 0)[:, 0]
            # else:
            self.pd_train, _ = self.pdtype.pdfromlatent(self.h_avg,
                                                        init_scale=0.01)
            self.vf_train = fc(self.h, 'v', 1)[:, 0]

            # if Config.SNI:
            #     assert Config.DROPOUT == 0
            #     assert not Config.OPENAI
            #     # Used with VIB: Noiseless pd_run and _both_ value functions
            #     print("Activating SNI (includes VF)")

            #     # Use deterministic value function for both as VIB for regression seems like a bad idea
            #     self.vf_run = self.vf_train = fc(self.h_vf, 'v', 1)[:, 0]

            #     # Have a deterministic run policy based on the mean
            #     self.pd_run, _ = self.pdtype.pdfromlatent(self.h_vf, init_scale=0.01)
            # elif Config.SNI2:
            #     assert not Config.OPENAI
            #     # Used with Dropout instead of OPENAI modifier
            #     # 'RUN' versions are updated slowly, train versions updated faster, gradients are mixed
            #     print("Activating SNI2")

            #     # Deterministic bootstrap value... doesn't really matter but this is more consistent
            #     self.vf_run = fc(h_vf, 'v', 1)[:, 0]

            #     # Run policy based on slow changing latent
            #     self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01)
            #     # Train is updated for each gradient update, slow is only updated once per batch
            # elif Config.OPENAI:
            #     # Completely overwrite train versions as everything changes slowly
            #     # Train version is same as run version, both of which are slow
            #     self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01)
            #     self.pd_train = self.pd_run
            #     self.vf_run = self.vf_train = fc(h_vf, 'v', 1)[:, 0]

            #     # Stochastic version is never used, so can set to ignore
            #     self.train_dropout_assign_ops = []
            # else:
            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train

            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []

        # Old aidl version
        # with tf.variable_scope("model", reuse=True) as scope:
        #     y = tf.constant([1.0, 0.0])
        #     _, anchor_rep, _, _ = choose_cnn(PROC_ANCH)

        #     _, pos_rep, _, _ = choose_cnn(PROC_POS)

        #     _, neg_rep, _, _ = choose_cnn(PROC_NEG)

        #     # (num_envs, m, nodes)
        #     anchor_rep = tf.reshape(anchor_rep, [Config.NUM_ENVS, Config.REP_LOSS_M, -1])
        #     pos_rep = tf.reshape(pos_rep, [Config.NUM_ENVS, Config.REP_LOSS_M, -1])

        #     # (neg_samples, num_envs, m, nodes)
        #     neg_rep = tf.reshape(neg_rep, [Config.NEGS, Config.NUM_ENVS, Config.REP_LOSS_M, -1])

        #     # (num_envs, m) multiply all representation layers across envs, and trajectories
        #     pos_matr = tf.einsum('aij,aij->ai', anchor_rep, pos_rep)

        #     # logit for positive sample and anchor
        #     pos_logit = tf.expand_dims(tf.reduce_mean(pos_matr), axis=0)
        #     # (neg_samples, num_envs, m) multiply all representation layers across envs, and trajectories
        #     # for each negative sample
        #     neg_matr = tf.einsum('aij,kaij->kai', anchor_rep, neg_rep)

        #     # get average over negative samples to find logits
        #     neg_logits = tf.math.reduce_mean(neg_matr, axis=(1, 2))

        #     # TODO put back in tanh clamping in case things get unstable with InfoNCE
        #     logits = tf.concat([pos_logit, neg_logits], axis=0)
        #     # bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit)
        #     # loss is negative of first logit, which is positive samp/ anchor
        #     neg_probs = tf.math.negative(tf.nn.log_softmax(logits))
        #     self.rep_loss = neg_probs[0]*Config.REP_LOSS_WEIGHT*-1

        # with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
        #     params = tf.trainable_variables()
        #     # Apply custom loss
        #     trainer = None
        #     if Config.SYNC_FROM_ROOT:
        #         trainer = MpiAdamOptimizer(MPI.COMM_WORLD, epsilon=1e-5)
        #     else:
        #         trainer = tf.train.AdamOptimizer( epsilon=1e-5)
        #     rep_params = params[:-6]
        #     grads_and_var = trainer.compute_gradients(self.rep_loss, rep_params)
        #     grads, var = zip(*grads_and_var)
        #     if max_grad_norm is not None:
        #         grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        #     grads_and_var = list(zip(grads, var))
        #     _custtrain = trainer.apply_gradients(grads_and_var)

        # Used in step
        a0_run = self.pd_run.sample()
        neglogp0_run = self.pd_run.neglogp(a0_run)
        self.initial_state = None

        def step(ob, phi_bar, update_frac, *_args, **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)
            a, v, neglogp = sess.run([a0_run, self.vf_run, neglogp0_run], {
                X: ob,
                AVG_REP_PROC: phi_bar
            })
            return a, v, self.initial_state, neglogp

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h, {X: ob})

        def value(ob, update_frac, *_args, **_kwargs):
            return sess.run(self.vf_run, {X: ob})

        def custom_train(anchors, pos_traj, neg_traj):
            # NOTE: self.rep_loss and _custtrain are only built in the
            # commented-out representation-loss block above; restore that
            # block before calling this function.
            return sess.run([self.rep_loss, _custtrain], {
                ANCHORS: anchors,
                POST_TRAJ: pos_traj,
                NEG_TRAJ: neg_traj
            })[:-1]

        self.X = X
        self.ANCHORS = ANCHORS
        self.POST_TRAJ = POST_TRAJ
        self.NEG_TRAJ = NEG_TRAJ
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.custom_train = custom_train
        self.rep_vec = rep_vec
        self.AVG_REP_PROC = AVG_REP_PROC
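A usage sketch for the step() above: the second positional argument is the averaged representation fed through AVG_REP_PROC, whose width (256) matches latent_space. Names and values are illustrative, not from the source.

import numpy as np

obs = np.zeros((nbatch,) + ob_space.shape, dtype=np.float32)
phi_bar = np.zeros((nbatch, 256), dtype=np.float32)
actions, values, _, neglogps = policy.step(obs, phi_bar, update_frac=0.0)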
Example #27
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        self.rep_loss = None
        # explicitly create a vector space for latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256, ))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape,
                                         dtype=np.float32,
                                         name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, None)
            TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
            REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32,
                                                shape=(None, 64, 64, 3),
                                                name='Rep_Proc')
            Z_INT = tf.compat.v1.placeholder(dtype=tf.int32,
                                             shape=(),
                                             name='Curr_Skill_idx')
            Z = tf.compat.v1.placeholder(dtype=tf.float32,
                                         shape=(None, Config.N_SKILLS),
                                         name='Curr_skill')
            CLUSTER_DIMS = 128
            HIDDEN_DIMS_SSL = 256
            self.protos = tf.compat.v1.Variable(
                initial_value=tf.random.normal(shape=(CLUSTER_DIMS,
                                                      Config.N_SKILLS)),
                trainable=True,
                name='Prototypes')
            self.A = self.pdtype.sample_placeholder([None], name='A')
            # trajectories of length m, for N policy heads.
            self.STATE = tf.compat.v1.placeholder(tf.float32,
                                                  [None, 64, 64, 3])
            self.STATE_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3])
            self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32,
                                                     [None, 64, 64, 3])
            # labels of Q value quantile bins
            self.LAB_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.POLICY_NHEADS, None])
            self.A_i = self.pdtype.sample_placeholder(
                [None, Config.REP_LOSS_M, 1], name='A_i')
            self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None],
                                                      name='R_cluster')
            self.A_cluster = self.pdtype.sample_placeholder([None],
                                                            name='A_cluster')

        # NOTE: this overwrites the observation placeholder; REP_PROC is only
        # defined in the non-REPLAY branch above.
        X = REP_PROC

        with tf.compat.v1.variable_scope("target",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            with tf.compat.v1.variable_scope("value",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                    X)

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            with tf.compat.v1.variable_scope("value",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.h_v = tf.concat([act_condit, act_invariant], axis=1)

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            with tf.compat.v1.variable_scope("policy",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                act_condit_pi, act_invariant_pi, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                    X)
                self.train_dropout_assign_ops = fast_dropout_assigned_ops
                self.run_dropout_assign_ops = slow_dropout_assign_ops

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            with tf.compat.v1.variable_scope("policy",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.h_pi = tf.concat([act_condit_pi, act_invariant_pi],
                                      axis=1)
                act_one_hot = tf.reshape(tf.one_hot(self.A, ac_space.n),
                                         (-1, ac_space.n))
                self.adv_pi = get_linear_layer(n_in=256 + 15, n_out=1)(
                    tf.concat([self.h_pi, act_one_hot], axis=1))
                self.v_pi = get_linear_layer(n_in=256, n_out=1)(self.h_pi)
        """
        Clustering part
        """

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            with tf.compat.v1.variable_scope("value",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                # h_codes: n_batch x n_t x n_rkhs
                act_condit, act_invariant, _, _ = choose_cnn(X)
                self.h_codes = tf.transpose(
                    tf.reshape(tf.concat([act_condit, act_invariant], axis=1),
                               [-1, Config.NUM_ENVS, 256]), (1, 0, 2))
                act_one_hot = tf.transpose(
                    tf.reshape(tf.one_hot(self.A_cluster, ac_space.n),
                               [-1, Config.NUM_ENVS, ac_space.n]), (1, 0, 2))
                h_acc = []
                for k in range(Config.CLUSTER_T):
                    h_t = self.h_codes[:, k:tf.shape(self.h_codes)[1] -
                                       (Config.CLUSTER_T - k - 1)]
                    a_t = act_one_hot[:, k:tf.shape(act_one_hot)[1] -
                                      (Config.CLUSTER_T - k - 1)]
                    h_t = tf.reshape(
                        FiLM(widths=[128], name='FiLM_layer')([
                            tf.expand_dims(
                                tf.expand_dims(tf.reshape(h_t, (-1, 256)), 1),
                                1),
                            tf.reshape(a_t, (-1, 15))
                        ])[:, 0, 0], (Config.NUM_ENVS, -1, 256))
                    h_acc.append(h_t)

                h_seq = tf.reshape(tf.concat(h_acc, 2),
                                   (-1, 256 * Config.CLUSTER_T))

                self.z_t = get_online_predictor(n_in=256 * Config.CLUSTER_T,
                                                n_out=CLUSTER_DIMS,
                                                prefix='SH_z_pred')(h_seq)

                self.u_t = get_predictor(n_in=CLUSTER_DIMS,
                                         n_out=CLUSTER_DIMS,
                                         prefix='SH_u_pred')(self.z_t)

        self.z_t_1 = self.z_t
        # scores: n_batch x n_clusters
        scores = tf.linalg.matmul(
            tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0],
            tf.linalg.normalize(self.protos, axis=1, ord='euclidean')[0])
        self.codes = sinkhorn(scores=scores)
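        # A NumPy sketch of the score computation above, mirroring the axis=1
        # normalization used in the source (shapes illustrative):
        #   z = np.random.randn(8, 128)        # n_batch x CLUSTER_DIMS
        #   protos = np.random.randn(128, 10)  # CLUSTER_DIMS x N_SKILLS
        #   z_n = z / np.linalg.norm(z, axis=1, keepdims=True)
        #   p_n = protos / np.linalg.norm(protos, axis=1, keepdims=True)
        #   scores = z_n @ p_n                 # n_batch x n_clusters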

        if Config.MYOW:
            """
            Compute average cluster reward 1/N_i \sum_{C_i} V^pi(s_j)

            TODO: mine nearby representations of [st,stp1] with [st,at,stp1]? these two should be close if transitions are deterministic
            """
            cluster_idx = tf.argmax(scores, 1)
            if False:
                reward_scale = []
                for i in range(Config.N_SKILLS):
                    filter_ = tf.cast(tf.fill(tf.shape(self.R_cluster), i),
                                      tf.float32)
                    mask = tf.cast(tf.math.equal(filter_, self.codes),
                                   tf.float32)
                    rets_cluster = tf.reduce_mean(mask * self.R_cluster)
                    reward_scale.append(rets_cluster)
                self.cluster_returns = tf.stack(reward_scale)
                # Predict the average cluster value from the prototype (centroid)
                with tf.compat.v1.variable_scope(
                        "online", reuse=tf.compat.v1.AUTO_REUSE):
                    self.cluster_value_mse_loss = tf.reduce_mean(
                        (get_predictor(n_in=CLUSTER_DIMS, n_out=1)(
                            tf.transpose(self.protos)) -
                         self.cluster_returns)**2)
            else:
                self.cluster_value_mse_loss = 0.
            """
            MYOW where k-NN neighbors are replaced by Sinkhorn clusters
            """
            with tf.compat.v1.variable_scope("random",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                # h_codes: n_batch x n_t x n_rkhs
                act_condit_target, act_invariant_target, _, _ = choose_cnn(X)
                h_codes_target = tf.transpose(
                    tf.reshape(
                        tf.concat([act_condit_target, act_invariant_target],
                                  axis=1), [-1, Config.NUM_ENVS, 256]),
                    (1, 0, 2))
                h_t_target = h_codes_target[:, :-1]
                h_tp1_target = h_codes_target[:, 1:]

                # h_a_t = tf.transpose(tf.reshape(get_predictor(n_in=ac_space.n,n_out=256,prefix="SH_a_emb")( act_one_hot), (-1,Config.NUM_ENVS,256)), (1,0,2))
                h_seq_target = tf.reshape(
                    tf.concat([h_t_target, h_tp1_target], 2),
                    (-1, 256 * Config.CLUSTER_T))
                # act_one_hot_target = tf.reshape(tf.one_hot(self.A_cluster,ac_space.n), (-1,ac_space.n))
                # h_seq_target = tf.squeeze(tf.squeeze(FiLM(widths=[512,512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq_target,1),1), act_one_hot_target]),1),1)
            y_online = h_seq
            y_target = tf.stop_gradient(h_seq_target)
            # y_reward = tf.reshape(self.R_cluster,(-1,1))

            # get K closest vectors by Sinkhorn scores
            # dist = _compute_distance(y_reward,y_reward)
            dist = _compute_distance(y_online, y_target)
            k_t = 3
            vals, indx = tf.nn.top_k(-dist, k_t + 1, sorted=True)

            # N_target = y_target
            with tf.compat.v1.variable_scope("online",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                v_online_net = get_predictor(n_in=256 * Config.CLUSTER_T,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_v_pred')
                r_online_net = get_predictor(n_in=HIDDEN_DIMS_SSL,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_r_pred')
                v_online = v_online_net(y_online)
                r_online = r_online_net(v_online)
            with tf.compat.v1.variable_scope("target",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                v_target_net = get_predictor(n_in=256 * Config.CLUSTER_T,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_v_pred')
                r_target_net = get_predictor(n_in=HIDDEN_DIMS_SSL,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_r_pred')

            self.myow_loss = 0.
            for k in range(k_t):
                indx2 = indx[:, k]
                N_target = tf.gather(y_target, indx2)
                v_target = v_target_net(N_target)
                r_target = r_target_net(v_target)

                self.myow_loss += tf.reduce_mean(cos_loss(
                    r_online,
                    v_target))  #+ tf.reduce_mean(cos_loss(r_target, v_online))

            # with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            #     phi_s = get_online_predictor(n_in=256,n_out=CLUSTER_DIMS,prefix='SH_z_pred')(tf.reshape(h_acc[-1],(-1,256)))
            #     self.myow_loss += tf.reduce_mean(cos_loss(phi_s, tf.transpose(tf.gather(self.protos,cluster_idx,axis=1),(1,0)) ))

            self.myow_loss += self.cluster_value_mse_loss

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            with tf.compat.v1.variable_scope("policy",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train = [
                    self.pdtype.pdfromlatent(self.h_pi, init_scale=0.01)[0]
                ]

            with tf.compat.v1.variable_scope("value",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.vf_train = [fc(self.h_v, 'v_0', 1)[:, 0]]

                # Plain Dropout version: Only fast updates / stochastic latent for VIB
                self.pd_run = self.pd_train
                self.vf_run = self.vf_train

                # For Dropout: Always change layer, so slow layer is never used
                self.run_dropout_assign_ops = []

        # Use the current head for classical PPO updates
        a0_run = [self.pd_run[0].sample()]
        neglogp0_run = [self.pd_run[0].neglogp(a0_run[0])]
        self.initial_state = None

        def step(ob,
                 update_frac,
                 skill_idx=None,
                 one_hot_skill=None,
                 nce_dict={},
                 *_args,
                 **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)
            a, v, v_i, neglogp = sess.run(
                [a0_run[0], self.vf_run[0], self.vf_run[0], neglogp0_run[0]], {
                    REP_PROC: ob,
                    Z: one_hot_skill
                })
            return a, v, v_i, self.initial_state, neglogp

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h_pi, {X: ob})

        def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})

        def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            return sess.run(self.vf_run[0], {REP_PROC: ob, Z: one_hot_skill})

        def nce_fw_pass(nce_dict):
            # NOTE: self.vf_i_run is never defined in this constructor, and
            # self.rep_loss is None here; this will fail unless both are set.
            return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

        def custom_train(ob, rep_vecs):
            return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

        def compute_codes(ob, act):
            return sess.run([
                tf.reshape(self.codes,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.z_t_1,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                self.h_codes[:, 1:]
            ], {
                REP_PROC: ob,
                self.A_cluster: act
            })

        def compute_hard_codes(ob):
            return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

        def compute_cluster_returns(returns):
            return sess.run([self.cluster_returns], {self.R_cluster: returns})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.value_i = value_i
        self.rep_vec = rep_vec
        self.custom_train = custom_train
        self.nce_fw_pass = nce_fw_pass
        self.encoder = choose_cnn
        self.REP_PROC = REP_PROC
        self.Z = Z
        self.compute_codes = compute_codes
        self.compute_hard_codes = compute_hard_codes
        self.compute_cluster_returns = compute_cluster_returns
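The cos_loss helper used by the MYOW terms above is not shown in these snippets. A minimal sketch consistent with how it is called, assuming a BYOL/MYOW-style negative cosine similarity (the body below is an assumption, not the original implementation):

import tensorflow as tf

def cos_loss(p, z):
    # Negative cosine similarity between predictions p and targets z,
    # both [batch, dim]; the caller stop-gradients z, as the code above does.
    p = tf.math.l2_normalize(p, axis=1)
    z = tf.math.l2_normalize(z, axis=1)
    return -tf.reduce_sum(p * z, axis=1)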
Example No. 28
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        self.rep_loss = None
        # explicitly create vector space for latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256, ))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape,
                                         dtype=np.float32,
                                         name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, None)
            TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
            REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32,
                                                shape=(None, 64, 64, 3),
                                                name='Rep_Proc')
            Z_INT = tf.compat.v1.placeholder(dtype=tf.int32,
                                             shape=(),
                                             name='Curr_Skill_idx')
            Z = tf.compat.v1.placeholder(dtype=tf.float32,
                                         shape=(nbatch, Config.N_SKILLS),
                                         name='Curr_skill')
            CODES = tf.compat.v1.placeholder(dtype=tf.float32,
                                             shape=(1024, Config.N_SKILLS),
                                             name='Train_Codes')
            CLUSTER_DIMS = 256
            HIDDEN_DIMS_SSL = 256
            STEP_BOOL = tf.compat.v1.placeholder(tf.bool, shape=[])
            self.protos = tf.compat.v1.Variable(
                initial_value=tf.random.normal(shape=(CLUSTER_DIMS,
                                                      Config.N_SKILLS)),
                trainable=True,
                name='Prototypes')
            self.A = self.pdtype.sample_placeholder([None], name='A')
            self.R = tf.compat.v1.placeholder(tf.float32, [None], name='R')
            # trajectories of length m, for N policy heads.
            self.STATE = tf.compat.v1.placeholder(tf.float32,
                                                  [None, 64, 64, 3])
            self.STATE_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3])
            self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32,
                                                     [None, 64, 64, 3])
            # labels of Q value quantile bins
            self.LAB_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.POLICY_NHEADS, None])
            self.A_i = self.pdtype.sample_placeholder(
                [None, Config.REP_LOSS_M, 1], name='A_i')
            self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None])
            self.A_cluster = self.pdtype.sample_placeholder(
                [None, Config.NUM_ENVS], name='A_cluster')

            self.pse_obs_1 = tf.compat.v1.placeholder(tf.float32,
                                                      [None, 64, 64, 3])
            self.pse_actions_1 = self.pdtype.sample_placeholder([None],
                                                                name='A_1')
            self.pse_rewards_1 = tf.compat.v1.placeholder(tf.float32, [None],
                                                          name='R_1')
            self.pse_obs_2 = tf.compat.v1.placeholder(tf.float32,
                                                      [None, 64, 64, 3])
            self.pse_actions_2 = self.pdtype.sample_placeholder([None],
                                                                name='A_2')
            self.pse_rewards_2 = tf.compat.v1.placeholder(tf.float32, [None],
                                                          name='R_2')

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                processed_x)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops
            self.h = tf.concat([act_condit, act_invariant], axis=1)
        """
        PSEs code
        """
        contrastive_loss_temperature = Config.TEMP
        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            n_pse = tf.shape(self.pse_obs_1)[0]
            concat_pse_obs = tf.concat([self.pse_obs_1, self.pse_obs_2], 0)
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                concat_pse_obs)
            h_pse = tf.concat([act_condit, act_invariant], axis=1)
            representation_1, representation_2 = h_pse[:n_pse], h_pse[n_pse:]
            # PSE loss
            act1 = tf.one_hot(self.pse_actions_1, 15)
            act2 = tf.one_hot(self.pse_actions_2, 15)

            # act1 = tf.reshape(act1,(Config.NUM_ENVS,-1,15))
            # act2 = tf.reshape(act2,(Config.NUM_ENVS,-1,15))

            metric_vals = compute_psm_metric(act1, act2, Config.GAMMA)

            self.contrastive_loss = Config.REP_LOSS_WEIGHT * representation_alignment_loss(
                representation_1,
                representation_2,
                metric_vals,
                use_coupling_weights=True,
                temperature=contrastive_loss_temperature,
                return_representation=False)

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):

            with tf.compat.v1.variable_scope("head_0",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train = [
                    self.pdtype.pdfromlatent(tf.stop_gradient(self.h),
                                             init_scale=0.01)[0]
                ]

            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]
            # Intrinsic value head; value_i() and nce_fw_pass() below read
            # self.vf_i_run, which would otherwise be undefined here.
            self.vf_i_train = fc(tf.stop_gradient(self.h), 'v_i', 1)[:, 0]
            self.vf_i_run = self.vf_i_train

            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train

            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []

        # Use the current head for classical PPO updates
        a0_run = [
            self.pd_run[head_idx].sample()
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        neglogp0_run = [
            self.pd_run[head_idx].neglogp(a0_run[head_idx])
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        self.initial_state = None

        def step(ob,
                 update_frac,
                 skill_idx=None,
                 one_hot_skill=None,
                 nce_dict={},
                 *_args,
                 **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)

            head_idx = 0
            a, v, neglogp = sess.run([
                a0_run[head_idx], self.vf_run[head_idx], neglogp0_run[head_idx]
            ], {X: ob})
            return a, v, self.initial_state, neglogp

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h, {X: ob})

        def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            if Config.AGENT == 'ppo_diayn':
                return sess.run(self.vf_run, {X: ob, Z: one_hot_skill})
            elif Config.AGENT == 'ppo_goal':
                return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})
            else:
                return sess.run(self.vf_run, {self.STATE: ob, X: ob})

        def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            if Config.AGENT == 'ppo_diayn':
                return sess.run(self.vf_i_run, {X: ob, Z: one_hot_skill})
            elif Config.AGENT == 'ppo_goal':
                return sess.run(self.vf_i_run, {
                    REP_PROC: ob,
                    Z: one_hot_skill
                })
            else:
                return sess.run(self.vf_i_run, {self.STATE: ob, X: ob})

        def nce_fw_pass(nce_dict):
            return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

        def custom_train(ob, rep_vecs):
            return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

        def compute_codes(ob, act):
            return sess.run([
                tf.reshape(self.codes,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.z_t_1,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                self.h_codes[:, 1:]
            ], {
                REP_PROC: ob,
                self.A_cluster: act
            })

        def compute_hard_codes(ob):
            return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

        def compute_cluster_returns(returns):
            return sess.run([self.cluster_returns], {self.R_cluster: returns})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.value_i = value_i
        self.rep_vec = rep_vec
        self.custom_train = custom_train
        self.nce_fw_pass = nce_fw_pass
        self.encoder = choose_cnn
        self.REP_PROC = REP_PROC
        self.Z = Z
        self.compute_codes = compute_codes
        self.compute_hard_codes = compute_hard_codes
        self.compute_cluster_returns = compute_cluster_returns
        self.CODES = CODES
        self.STEP_BOOL = STEP_BOOL
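compute_psm_metric and representation_alignment_loss above come from the policy similarity embedding (PSE) line of work and are not defined in this snippet. A simplified sketch of a policy similarity metric, under the assumptions that both inputs are one-hot action sequences and that successor states are the aligned next timesteps:

import tensorflow as tf

def compute_psm_metric(actions1, actions2, gamma, n_sweeps=20):
    # actions1: [T1, n_act] one-hot, actions2: [T2, n_act] one-hot.
    # Pairwise action mismatch (total-variation distance) ...
    cost = 0.5 * tf.reduce_sum(
        tf.abs(actions1[:, None] - actions2[None, :]), axis=-1)
    # ... plus a discounted recursion over aligned successors,
    #   d(i, j) = cost(i, j) + gamma * d(i+1, j+1),
    # approximated with a fixed number of value-iteration sweeps.
    d = tf.zeros_like(cost)
    for _ in range(n_sweeps):
        d_next = tf.pad(d[1:, 1:], [[0, 1], [0, 1]])
        d = cost + gamma * d_next
    return d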
Example No. 29
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            conv1 = caps_cnn(processed_x, **conv_kwargs)
            conv1 = tf.transpose(
                conv1, [0, 3, 1, 2])  # reshape to expected input format
            conv1 = tf.expand_dims(conv1, 1)
            capsule1 = layers.conv_slim_capsule(
                conv1,
                input_dim=1,
                output_dim=32,
                layer_name='conv_capsule1',
                num_routing=1,
                input_atoms=256,
                output_atoms=8,
                stride=2,
                kernel_size=9,
                padding='VALID',
                leaky=False,
            )
            capsule1_atom_last = tf.transpose(capsule1, [0, 1, 3, 4, 2])
            capsule1_3d = tf.reshape(capsule1_atom_last,
                                     [tf.shape(conv1)[0], -1, 8])
            _, _, _, height, width = capsule1.get_shape()
            input_dim1 = 32 * height.value * width.value
            # main encoding layer
            h = layers.capsule(
                input_tensor=capsule1_3d,
                input_dim=input_dim1,
                output_dim=8,
                layer_name='capsule2',
                input_atoms=8,
                output_atoms=16,
                num_routing=3,
                leaky=False,
            )
            # capsule policy layer
            hpi = layers.capsule(
                input_tensor=h,
                input_dim=8,
                output_dim=4,
                layer_name='capsule_pi',
                input_atoms=16,
                output_atoms=4,
                num_routing=3,
                leaky=False,
            )
            pnorm = tf.reduce_sum(tf.square(hpi), axis=-1)
            # value function
            hvf = conv_to_fc(h)
            vf = fc(hvf, 'v', 1)[:, 0]
            # policy based on pnorm (the squared norms of policy capsule vecs)
            self.pd, self.pi = self.pdtype.pdfromflat(pnorm), pnorm

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
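The policy above reads off one logit per discrete action as the squared norm of a policy-capsule vector (pnorm). Restated standalone, assuming capsules laid out as [batch, n_caps, atoms]:

import tensorflow as tf

def capsule_logits(capsules):
    # capsules: [batch, n_caps, atoms], one capsule per discrete action;
    # the squared L2 norm of each capsule vector is that action's logit,
    # mirroring pnorm above.
    return tf.reduce_sum(tf.square(capsules), axis=-1)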
Example No. 30
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        self.rep_loss = None
        # explicitly create vector space for latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256, ))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape,
                                         dtype=np.float32,
                                         name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, None)
            TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
            REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32,
                                                shape=(None, 64, 64, 3),
                                                name='Rep_Proc')
            Z_INT = tf.compat.v1.placeholder(dtype=tf.int32,
                                             shape=(),
                                             name='Curr_Skill_idx')
            Z = tf.compat.v1.placeholder(dtype=tf.float32,
                                         shape=(nbatch, Config.N_SKILLS),
                                         name='Curr_skill')
            CODES = tf.compat.v1.placeholder(dtype=tf.float32,
                                             shape=(1024, Config.N_SKILLS),
                                             name='Train_Codes')
            CLUSTER_DIMS = 256
            HIDDEN_DIMS_SSL = 256
            STEP_BOOL = tf.compat.v1.placeholder(tf.bool, shape=[])
            self.protos = tf.compat.v1.Variable(
                initial_value=tf.random.normal(shape=(CLUSTER_DIMS,
                                                      Config.N_SKILLS)),
                trainable=True,
                name='Prototypes')
            self.A = self.pdtype.sample_placeholder([None], name='A')
            self.R = tf.compat.v1.placeholder(tf.float32, [None], name='R')
            # trajectories of length m, for N policy heads.
            self.STATE = tf.compat.v1.placeholder(tf.float32,
                                                  [None, 64, 64, 3])
            self.STATE_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3])
            self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32,
                                                     [None, 64, 64, 3])
            # labels of Q value quantile bins
            self.LAB_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.POLICY_NHEADS, None])
            self.A_i = self.pdtype.sample_placeholder(
                [None, Config.REP_LOSS_M, 1], name='A_i')
            self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None])
            self.A_cluster = self.pdtype.sample_placeholder(
                [None, Config.NUM_ENVS], name='A_cluster')

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                processed_x)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops
            self.h = tf.concat([act_condit, act_invariant], axis=1)
        """
        Bisimulation code
        """
        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            # encoder loss
            act_one_hot_target = tf.reshape(tf.one_hot(self.A, ac_space.n),
                                            (-1, ac_space.n))
            pred_next_latent_mu1 = get_transition_model()(tf.concat(
                [self.h, act_one_hot_target], axis=1))
            pred_next_latent_mu2 = shuffle_custom(pred_next_latent_mu1)

            z_dist = tf.reduce_mean(
                tf.compat.v1.losses.huber_loss(
                    self.h,
                    shuffle_custom(self.h),
                    reduction=tf.compat.v1.losses.Reduction.NONE), 1)
            r_dist = tf.compat.v1.losses.huber_loss(
                self.R,
                shuffle_custom(self.R),
                reduction=tf.compat.v1.losses.Reduction.NONE)
            transition_dist = tf.reduce_mean(
                tf.compat.v1.losses.huber_loss(
                    pred_next_latent_mu1,
                    pred_next_latent_mu2,
                    reduction=tf.compat.v1.losses.Reduction.NONE), 1)

            bisimilarity = r_dist + Config.GAMMA * transition_dist
            self.encoder_bisimilarity_loss = tf.reduce_mean(
                tf.math.pow(z_dist - bisimilarity, 2))

            # latent loss
            pred_next_latent_mu1_3d = tf.transpose(
                tf.reshape(pred_next_latent_mu1, [-1, Config.NUM_ENVS, 256]),
                (1, 0, 2))  # 32 x n_timesteps x n_hidden
            h_3d = tf.transpose(tf.reshape(self.h, [-1, Config.NUM_ENVS, 256]),
                                (1, 0, 2))  # 32 x n_timesteps x n_hidden
            pred_next_latent_mu1 = pred_next_latent_mu1_3d[:, :-1, :]  # t = 0 to n_timesteps-1
            next_h = h_3d[:, 1:, :]  # t = 1 to n_timesteps
            diff = (pred_next_latent_mu1 - tf.stop_gradient(next_h))
            self.latent_transition_loss = tf.reduce_mean(0.5 *
                                                         tf.math.pow(diff, 2))

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):

            with tf.compat.v1.variable_scope("head_0",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train = [
                    self.pdtype.pdfromlatent(tf.stop_gradient(self.h),
                                             init_scale=0.01)[0]
                ]

            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]
            # Intrinsic value head; value_i() and nce_fw_pass() below read
            # self.vf_i_run, which would otherwise be undefined here.
            self.vf_i_train = fc(tf.stop_gradient(self.h), 'v_i', 1)[:, 0]
            self.vf_i_run = self.vf_i_train

            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train

            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []

        # Use the current head for classical PPO updates
        a0_run = [
            self.pd_run[head_idx].sample()
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        neglogp0_run = [
            self.pd_run[head_idx].neglogp(a0_run[head_idx])
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        self.initial_state = None

        def step(ob,
                 update_frac,
                 skill_idx=None,
                 one_hot_skill=None,
                 nce_dict={},
                 *_args,
                 **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)

            head_idx = 0
            a, v, neglogp = sess.run([
                a0_run[head_idx], self.vf_run[head_idx], neglogp0_run[head_idx]
            ], {X: ob})
            return a, v, self.initial_state, neglogp

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h, {X: ob})

        def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            if Config.AGENT == 'ppo_diayn':
                return sess.run(self.vf_run, {X: ob, Z: one_hot_skill})
            elif Config.AGENT == 'ppo_goal':
                return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})
            else:
                return sess.run(self.vf_run, {self.STATE: ob, X: ob})

        def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            if Config.AGENT == 'ppo_diayn':
                return sess.run(self.vf_i_run, {X: ob, Z: one_hot_skill})
            elif Config.AGENT == 'ppo_goal':
                return sess.run(self.vf_i_run, {
                    REP_PROC: ob,
                    Z: one_hot_skill
                })
            else:
                return sess.run(self.vf_i_run, {self.STATE: ob, X: ob})

        def nce_fw_pass(nce_dict):
            return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

        def custom_train(ob, rep_vecs):
            return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

        def compute_codes(ob, act):
            return sess.run([
                tf.reshape(self.codes,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.z_t_1,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                self.h_codes[:, 1:]
            ], {
                REP_PROC: ob,
                self.A_cluster: act
            })

        def compute_hard_codes(ob):
            return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

        def compute_cluster_returns(returns):
            return sess.run([self.cluster_returns], {self.R_cluster: returns})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.value_i = value_i
        self.rep_vec = rep_vec
        self.custom_train = custom_train
        self.nce_fw_pass = nce_fw_pass
        self.encoder = choose_cnn
        self.REP_PROC = REP_PROC
        self.Z = Z
        self.compute_codes = compute_codes
        self.compute_hard_codes = compute_hard_codes
        self.compute_cluster_returns = compute_cluster_returns
        self.CODES = CODES
        self.STEP_BOOL = STEP_BOOL
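shuffle_custom, used to build the random state pairs for the bisimulation targets above, is not defined in these snippets. A plausible sketch (an assumption): permute the batch dimension, sharing one permutation across the calls on h, R, and the predicted transitions so that the pairs line up:

import tensorflow as tf

def shuffle_custom(x, perm=None):
    # Permute x along the batch dimension; pass the same perm to pair
    # states, rewards, and transitions consistently.
    if perm is None:
        perm = tf.random.shuffle(tf.range(tf.shape(x)[0]))
    return tf.gather(x, perm)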
Example No. 31
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)

        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.placeholder(shape=(nbatch, ) + ob_space.shape,
                               dtype=np.float32,
                               name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, h_vf, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                processed_x)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops

            # Noisy policy and value function for train
            if Config.BETA >= 0:
                pdparam = _matching_fc(h,
                                       'pi',
                                       ac_space.n,
                                       init_scale=1.0,
                                       init_bias=0)
                pdparam = tf.reshape(pdparam,
                                     shape=(Config.NR_SAMPLES, -1, ac_space.n))
                pdparam = tf.transpose(pdparam, perm=[1, 0, 2])

                dists = ds.Categorical(logits=pdparam)
                self.pd_train = ds.MixtureSameFamily(
                    mixture_distribution=ds.Categorical(
                        probs=[1. / Config.NR_SAMPLES] * Config.NR_SAMPLES),
                    components_distribution=dists)
                self.pd_train.neglogp = lambda a: -self.pd_train.log_prob(a)
                self.vf_train = tf.reduce_mean(
                    tf.reshape(fc(h, 'v', 1),
                               shape=(Config.NR_SAMPLES, -1, 1)), 0)[:, 0]
            else:
                self.pd_train, _ = self.pdtype.pdfromlatent(h, init_scale=0.01)
                self.vf_train = fc(h, 'v', 1)[:, 0]

            if Config.SNI:
                assert Config.DROPOUT == 0
                assert not Config.OPENAI
                # Used with VIB: Noiseless pd_run and _both_ value functions
                print("Activating SNI (includes VF)")

                # Use deterministic value function for both as VIB for regression seems like a bad idea
                self.vf_run = self.vf_train = fc(h_vf, 'v', 1)[:, 0]

                # Have a deterministic run policy based on the mean
                self.pd_run, _ = self.pdtype.pdfromlatent(h_vf,
                                                          init_scale=0.01)
            elif Config.SNI2:
                assert not Config.OPENAI
                # Used with Dropout instead of OPENAI modifier
                # 'RUN' versions are updated slowly, train versions updated faster, gradients are mixed
                print("Activating SNI2")

                # Deterministic bootstrap value... doesn't really matter but this is more consistent
                self.vf_run = fc(h_vf, 'v', 1)[:, 0]

                # Run policy based on slow changing latent
                self.pd_run, _ = self.pdtype.pdfromlatent(h_vf,
                                                          init_scale=0.01)
                # Train is updated for each gradient update, slow is only updated once per batch
            elif Config.OPENAI:
                # Completely overwrite train versions as everything changes slowly
                # Train version is same as run version, both of which are slow
                self.pd_run, _ = self.pdtype.pdfromlatent(h_vf,
                                                          init_scale=0.01)
                self.pd_train = self.pd_run
                self.vf_run = self.vf_train = fc(h_vf, 'v', 1)[:, 0]

                # Stochastic version is never used, so can set to ignore
                self.train_dropout_assign_ops = []
            else:
                # Plain Dropout version: Only fast updates / stochastic latent for VIB
                self.pd_run = self.pd_train
                self.vf_run = self.vf_train

                # For Dropout: Always change layer, so slow layer is never used
                self.run_dropout_assign_ops = []

        # Used in step
        a0_run = self.pd_run.sample()
        neglogp0_run = self.pd_run.neglogp(a0_run)
        self.initial_state = None

        def step(ob, update_frac, *_args, **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)
            a, v, neglogp = sess.run([a0_run, self.vf_run, neglogp0_run],
                                     {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, update_frac, *_args, **_kwargs):
            return sess.run(self.vf_run, {X: ob})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
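The Config.BETA >= 0 branch above averages the policy over Config.NR_SAMPLES stochastic forward passes by building a uniform mixture of categoricals. The same construction standalone, assuming ds is tensorflow_probability.distributions:

import tensorflow as tf
import tensorflow_probability as tfp

ds = tfp.distributions

def mixture_policy(logits, nr_samples):
    # logits: [nr_samples * batch, n_act], the stacked outputs of
    # nr_samples stochastic forward passes. Returns a uniform mixture
    # of the per-sample categorical policies.
    n_act = logits.shape[-1]
    logits = tf.reshape(logits, (nr_samples, -1, n_act))
    logits = tf.transpose(logits, perm=[1, 0, 2])  # [batch, nr_samples, n_act]
    return ds.MixtureSameFamily(
        mixture_distribution=ds.Categorical(
            probs=[1. / nr_samples] * nr_samples),
        components_distribution=ds.Categorical(logits=logits))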
Example No. 32
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            #
            if USE_COLOR_TRANSFORM:
                out_shape = processed_x.get_shape().as_list()

                mask_vbox = tf.Variable(tf.zeros_like(processed_x, dtype=bool),
                                        trainable=False)
                rh = .2  # hard-coded velocity box size
                # mh = tf.cast(tf.cast(out_shape[1], dtype=tf.float32)*rh, dtype=tf.int32)
                mh = int(out_shape[1] * rh)
                mw = mh * 2
                mask_vbox = mask_vbox[:, :mh, :mw].assign(
                    tf.ones([out_shape[0], mh, mw, out_shape[-1]], dtype=bool))
                masked = tf.where(mask_vbox,
                                  x=tf.zeros_like(processed_x),
                                  y=processed_x)

                # tf.image.adjust_brightness vs. ImageEnhance.Brightness
                # tf version is additive while PIL version is multiplicative
                delta_brightness = tf.get_variable(
                    name='randprocess_brightness',
                    initializer=tf.random_uniform([], -.5, .5),
                    trainable=False)

                # tf.image.adjust_contrast vs. PIL.ImageEnhance.Contrast
                delta_contrast = tf.get_variable(
                    name='randprocess_contrast',
                    initializer=tf.random_uniform([], .5, 1.5),
                    trainable=False,
                )

                # tf.image.adjust_saturation vs. PIL.ImageEnhance.Color
                delta_saturation = tf.get_variable(
                    name='randprocess_saturation',
                    initializer=tf.random_uniform([], .5, 1.5),
                    trainable=False,
                )

                processed_x1 = tf.image.adjust_brightness(
                    masked, delta_brightness)
                processed_x1 = tf.clip_by_value(processed_x1, 0., 255.)
                processed_x1 = tf.where(mask_vbox, x=masked, y=processed_x1)
                processed_x2 = tf.image.adjust_contrast(
                    processed_x1, delta_contrast)
                processed_x2 = tf.clip_by_value(processed_x2, 0., 255.)
                processed_x2 = tf.where(mask_vbox, x=masked, y=processed_x2)
                processed_x3 = tf.image.adjust_saturation(
                    processed_x2, delta_saturation)
                processed_x3 = tf.clip_by_value(processed_x3, 0., 255.)
                processed_x3 = tf.where(mask_vbox,
                                        x=processed_x,
                                        y=processed_x3)
            else:
                processed_x3 = processed_x
            #
            h, self.dropout_assign_ops = choose_cnn(processed_x3)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
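As the inline comments note, tf.image.adjust_brightness is additive while PIL's ImageEnhance.Brightness is multiplicative. If the PIL behaviour were wanted instead, a one-line sketch (assuming float images in [0, 255]):

import tensorflow as tf

def adjust_brightness_multiplicative(images, factor):
    # PIL-style multiplicative brightness: factor 1.0 is the identity,
    # 0.0 maps everything to black.
    return tf.clip_by_value(images * factor, 0., 255.)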
Example No. 33
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False,
                 name='policy', args=None): #pylint: disable=W0613
        policy_variance_state_dependent = args.policy_variance_state_dependent
        ac_fn = args.ac_fn
        hidden_sizes = args.hidden_sizes
        num_sharing_layers = args.num_sharing_layers
        num_layers = args.num_layers
        assert ac_fn in ['tanh', 'sigmoid', 'relu']

        if isinstance(hidden_sizes, int):
            assert num_layers is not None
            hidden_sizes = [hidden_sizes] * num_layers
        if num_layers is None:
            num_layers = len(hidden_sizes)
        assert num_layers == len(hidden_sizes)


        # print(f'Policy hidden_sizes:{hidden_sizes}')

        self.pdtype = make_pdtype(ac_space)

        with tf.variable_scope(name, reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)

            activ = getattr( tf.nn, ac_fn )
            processed_x = tf.layers.flatten(processed_x)

            # --- share layers
            for ind_layer in range(num_sharing_layers):
                processed_x = activ( fc(processed_x, f'share_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)) )

            # --- policy
            pi_h = processed_x
            for ind_layer in range( num_sharing_layers, num_layers ):
                pi_h = activ(fc(pi_h, f'pi_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

            from gym import spaces
            params_additional = {}
            if policy_variance_state_dependent and isinstance( ac_space, spaces.Box ):
                latent_logstd = processed_x
                for ind_layer in range(num_sharing_layers, num_layers):
                    latent_logstd = activ(fc(latent_logstd, f'logstd_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
                params_additional['latent_logstd'] = latent_logstd

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h, init_scale=0.01, logstd_initial=args.logstd, **params_additional)


            # --- value function
            vf_h = processed_x
            for ind_layer in range( num_sharing_layers, num_layers ):
                vf_h = activ(fc(vf_h, f'vf_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            vf = fc(vf_h, 'vf', 1)[:,0]



            a_sample = self.pd.sample()
            neglogp_sample = self.pd.neglogp(a_sample)
            self.initial_state = None


            # --- predict function
            # use placeholder
            # use stochastic action
            # use deterministic action
            if args.coef_predict_task > 0:
                import tensorflow.contrib.distributions as dists
                assert isinstance( ac_space, Box ), 'Only Implement for Box action space'
                A_type = tf.placeholder_with_default('pl', shape=[])
                A_pl = self.pdtype.sample_placeholder([None])
                self.A = A_pl
                self.A_type = A_type

                A_input_1 = U.switch( tf.equal( A_type, 'det' ), self.pd.mode(), a_sample )
                A_input = U.switch( tf.equal( A_type, 'pl' ), A_pl,A_input_1)
                predict_h = tf.concat((processed_x, A_input), axis=1)
                for ind_layer in range(num_sharing_layers, num_layers):
                    predict_h = activ(fc(predict_h, f'predict_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
                predict_mean = fc(predict_h, 'predict_mean', nh=ob_space.shape[0], init_scale=np.sqrt(2))  # distinct scope name; reusing f'predict_fc{ind_layer}' would clash with the last hidden layer

                predict_cov_init_value = np.identity( ob_space.shape[0] )
                predict_cov = tf.get_variable( name='predict_cov', shape=predict_cov_init_value.shape, initializer=tf.constant_initializer(predict_cov_init_value) )
                predict_dist = dists.MultivariateNormalTriL( predict_mean, predict_cov )
                self.predict_dist = predict_dist

            scope_model = tf.get_variable_scope().name
            self.variables_all = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_model)
            self.variables_trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope_model)


        #--- set logstd
        # if isinstance( ac_space, Box ):
        # if not policy_variance_state_dependent:
        #     logstd_pl, _ = observation_input( ac_space, batch_size=1, name='ac' )
        #     assign_logstd = tf.assign( self.pdtype.logstd, logstd_pl )
        #     set_logstd_entity = U.function([logstd_pl], assign_logstd)
        #     def set_logstd(logstd_new):
        #         # if isinstance( logstd_new, float  ):
        #         #     logstd_new = [[logstd_new] * ac_space.shape[0]]
        #         set_logstd_entity(logstd_new)
        #     self.set_logstd = set_logstd
        # self.get_logstd = U.function([], self.pdtype.logstd)

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a_sample, vf, neglogp_sample], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        def step_policyflat(ob, *_args, **_kwargs):
            a, v, neglogp, policyflat = sess.run([a_sample, vf, neglogp_sample, self.pd.flatparam()], {X:ob}) #TODO: TEST flat for discrete action space
            return a, v, self.initial_state, neglogp, policyflat

        def step_test(ob, *_args, **_kwargs):
            a = sess.run([self.pd.mode()], {X:ob})
            return a

        self.X = X
        self.vf = vf
        self.step = step
        self.step_policyflat = step_policyflat
        self.value = value
        self.step_test = step_test
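This policy reads its architecture from an args object rather than keyword arguments. A hypothetical minimal args value covering every attribute the constructor touches (field values are illustrative only):

from types import SimpleNamespace

args = SimpleNamespace(
    policy_variance_state_dependent=False,
    ac_fn='tanh',            # one of 'tanh', 'sigmoid', 'relu'
    hidden_sizes=64,         # int form requires num_layers ...
    num_layers=2,            # ... and expands hidden_sizes to [64, 64]
    num_sharing_layers=0,
    logstd=0.0,
    coef_predict_task=0.0,   # > 0 enables the predict head
)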
Example No. 34
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        self.rep_loss = None
        # explicitly create vector space for latent vectors
        latent_space = Box(-np.inf, np.inf, shape=(256, ))
        # So that I can compute the saliency map
        if Config.REPLAY:
            X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape,
                                         dtype=np.float32,
                                         name='Ob')
            processed_x = X
        else:
            X, processed_x = observation_input(ob_space, None)
            TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
            REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32,
                                                shape=(None, 64, 64, 3),
                                                name='Rep_Proc')
            Z_INT = tf.compat.v1.placeholder(dtype=tf.int32,
                                             shape=(),
                                             name='Curr_Skill_idx')
            Z = tf.compat.v1.placeholder(dtype=tf.float32,
                                         shape=(None, Config.N_SKILLS),
                                         name='Curr_skill')
            CLUSTER_DIMS = 128
            HIDDEN_DIMS_SSL = 256
            self.protos = tf.compat.v1.Variable(
                initial_value=tf.random.normal(shape=(CLUSTER_DIMS,
                                                      Config.N_SKILLS)),
                trainable=True,
                name='Prototypes')
            self.A = self.pdtype.sample_placeholder([None], name='A')
            # trajectories of length m, for N policy heads.
            self.STATE = tf.compat.v1.placeholder(tf.float32,
                                                  [None, 64, 64, 3])
            self.STATE_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3])
            self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32,
                                                     [None, 64, 64, 3])
            # labels of Q value quantile bins
            self.LAB_NCE = tf.compat.v1.placeholder(
                tf.float32, [Config.POLICY_NHEADS, None])
            self.A_i = self.pdtype.sample_placeholder(
                [None, Config.REP_LOSS_M, 1], name='A_i')
            self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None],
                                                      name='R_cluster')
            self.A_cluster = self.pdtype.sample_placeholder([None],
                                                            name='A_cluster')

        X = REP_PROC  #tf.reshape(REP_PROC, [-1, 64, 64, 3])

        with tf.compat.v1.variable_scope(
                "target" if Config.STOP_GRAD_PPO else "online",
                reuse=tf.compat.v1.AUTO_REUSE):
            act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(
                X)
            self.train_dropout_assign_ops = fast_dropout_assigned_ops
            self.run_dropout_assign_ops = slow_dropout_assign_ops
        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            self.h = tf.concat([act_condit, act_invariant], axis=1)
        """
        Clustering part
        """

        N_ACTIONS = 5 if Config.ENVIRONMENT == 'ising' else 15

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            # h_codes: n_batch x n_t x n_rkhs
            act_condit, act_invariant, _, _ = choose_cnn(X)
            self.h_codes = tf.transpose(
                tf.reshape(tf.concat([act_condit, act_invariant], axis=1),
                           [-1, Config.NUM_ENVS, 256]), (1, 0, 2))
            act_one_hot = tf.transpose(
                tf.reshape(tf.one_hot(self.A_cluster, ac_space.n),
                           [-1, Config.NUM_ENVS, ac_space.n]), (1, 0, 2))
            h_acc = []
            h_acc_no_act = []
            for k in range(Config.CLUSTER_T):
                h_t = self.h_codes[:, k:tf.shape(self.h_codes)[1] -
                                   (Config.CLUSTER_T - k - 1)]
                a_t = act_one_hot[:, k:tf.shape(act_one_hot)[1] -
                                  (Config.CLUSTER_T - k - 1)]
                h_t_film = tf.reshape(
                    FiLM(widths=[128], name='FiLM_layer')([
                        tf.expand_dims(
                            tf.expand_dims(tf.reshape(h_t, (-1, 256)), 1), 1),
                        tf.reshape(a_t, (-1, N_ACTIONS))
                    ])[:, 0, 0], (Config.NUM_ENVS, -1, 256))
                h_acc_no_act.append(tf.reshape(h_t,
                                               (Config.NUM_ENVS, -1, 256)))
                h_acc.append(h_t_film)

            # h_seq_no_act = tf.reshape( tf.concat(h_acc_no_act,2), (-1,256*Config.CLUSTER_T))
            h_seq = tf.reshape(tf.concat(h_acc, 2),
                               (-1, 256 * Config.CLUSTER_T))
            self.h_seq = h_seq

            # self.z_t_no_act = get_online_predictor(n_in=256*Config.CLUSTER_T,n_out=CLUSTER_DIMS,prefix='SH_z_pred_no_act')(h_seq_no_act)
            self.z_t = get_online_predictor(n_in=256 * Config.CLUSTER_T,
                                            n_out=CLUSTER_DIMS,
                                            prefix='SH_z_pred')(h_seq)

            self.u_t = get_predictor(n_in=CLUSTER_DIMS,
                                     n_out=CLUSTER_DIMS,
                                     prefix='SH_u_pred')(self.z_t)

        self.z_t_1 = self.z_t
        # scores: n_batch x n_clusters (cosine similarity to the prototypes);
        # each prototype is a column of self.protos, so prototypes are
        # normalized along the feature axis (axis=0)
        scores = tf.linalg.matmul(
            tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0],
            tf.linalg.normalize(self.protos, axis=0, ord='euclidean')[0])
        self.codes = sinkhorn(scores=scores)

        self.myow_loss = 0.
        if Config.MYOW:
            """
            MYOW where k-NN neighbors are replaced by Sinkhorn clusters
            """
            # with tf.compat.v1.variable_scope("random", reuse=tf.compat.v1.AUTO_REUSE):
            #     # h_codes: n_batch x n_t x n_rkhs
            #     act_condit_target, act_invariant_target, _, _ = choose_cnn(X)
            #     h_codes_target =  tf.transpose(tf.reshape(tf.concat([act_condit_target, act_invariant_target], axis=1),[-1,Config.NUM_ENVS,256]),(1,0,2))
            #     h_t_target = h_codes_target[:,:-1]
            #     h_tp1_target = h_codes_target[:,1:]

            #     # h_a_t = tf.transpose(tf.reshape(get_predictor(n_in=ac_space.n,n_out=256,prefix="SH_a_emb")( act_one_hot), (-1,Config.NUM_ENVS,256)), (1,0,2))
            #     h_seq_target = tf.reshape( tf.concat([h_t_target,h_tp1_target],2), (-1,256*Config.CLUSTER_T))
            # act_one_hot_target = tf.reshape(tf.one_hot(self.A_cluster,ac_space.n), (-1,ac_space.n))
            # h_seq_target = tf.squeeze(tf.squeeze(FiLM(widths=[512,512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq_target,1),1), act_one_hot_target]),1),1)
            y_online = h_seq
            y_target = tf.stop_gradient(h_seq)
            # y_reward = tf.reshape(self.R_cluster,(-1,1))

            # Find cluster adjacency scores
            dist = _compute_distance(tf.transpose(self.protos),
                                     tf.transpose(self.protos))

            k_t = Config.N_KNN
            vals, indx = tf.nn.top_k(-dist, k_t + 1, sorted=True)

            cluster_idx = tf.cast(tf.argmax(scores, 1), tf.int32)

            cluster_membership_list = []
            for i in range(Config.N_SKILLS):
                filter_ = tf.cast(tf.fill(tf.shape(cluster_idx), i), tf.int32)
                mask = tf.math.equal(filter_, cluster_idx)
                cluster_vecs = tf.cast(tf.where(mask), tf.int32)
                cluster_vecs = tf.cond(
                    tf.math.equal(tf.shape(cluster_vecs)[0], 0),
                    lambda: tf.constant([[0]], tf.int32), lambda: cluster_vecs)
                # cluster_idx = tf.cast(tf.round(tf.random.uniform((1,),maxval=tf.cast(tf.shape(cluster_vecs),tf.float32))[0]),tf.int32) # randomly sample a vector from its cluster
                cluster_membership_list.append(
                    cluster_vecs[0]
                )  # take first vector of this cluster as representative
            cluster_membership_list = tf.stack(cluster_membership_list)

            # import ipdb;ipdb.set_trace()

            # N_target = y_target
            with tf.compat.v1.variable_scope("online",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                v_online_net = get_predictor(n_in=256 * Config.CLUSTER_T,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_v_pred')
                r_online_net = get_predictor(n_in=HIDDEN_DIMS_SSL,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_r_pred')
                v_online = v_online_net(y_online)
                r_online = r_online_net(v_online)
            with tf.compat.v1.variable_scope("target",
                                             reuse=tf.compat.v1.AUTO_REUSE):
                v_target_net = get_predictor(n_in=256 * Config.CLUSTER_T,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_v_pred')
                r_target_net = get_predictor(n_in=HIDDEN_DIMS_SSL,
                                             n_out=HIDDEN_DIMS_SSL,
                                             prefix='MYOW_r_pred')

            for k in range(k_t):
                nearby_cluster_idx = tf.gather(indx[:, k + 1], cluster_idx)
                nearby_batch_vecs = tf.reshape(
                    tf.gather(cluster_membership_list,
                              tf.cast(nearby_cluster_idx, tf.int32)), (-1, ))
                N_target = tf.gather(y_target, nearby_batch_vecs)
                v_target = v_target_net(N_target)
                # r_target = r_target_net(v_target)

                self.myow_loss += tf.reduce_mean(cos_loss(
                    r_online,
                    v_target))  #+ tf.reduce_mean(cos_loss(r_target, v_online))

            # with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            #     phi_s = get_online_predictor(n_in=256,n_out=CLUSTER_DIMS,prefix='SH_z_pred')(tf.reshape(h_acc[-1],(-1,256)))
            #     self.myow_loss += tf.reduce_mean(cos_loss(phi_s, tf.transpose(tf.gather(self.protos,cluster_idx,axis=1),(1,0)) ))
        """
        Intrinsic rewards
        """
        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            self.R_I_SCALE = tf.nn.relu(
                get_linear_layer(n_in=256,
                                 n_out=1,
                                 prefix='r_i_scale',
                                 init=initializers.RandomNormal(stddev=0.11))(
                                     tf.reshape(tf.stop_gradient(h_acc[-1]),
                                                (-1, 256))))

            # self.h = get_predictor(n_in=256+Config.N_SKILLS,n_out=256)(tf.concat([self.h,tf.stop_gradient(scores)],1))
        """
        Condition on soft-cluster assignments for policy head (Cluster Conditioned Policy )
        """
        if Config.CLUSTER_CONDIT_POLICY:
            concat_code = tf.stop_gradient(
                tf.reshape(self.codes, [-1, Config.N_SKILLS]))
            # print(self.h)
            # print(concat_code)
            #self.h = tf.concat([self.h, concat_code], axis=1)
            #h_seq = tf.squeeze(tf.squeeze(FiLM(widths=[512,512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq,1),1), act_one_hot]),1),1)

        with tf.compat.v1.variable_scope("online",
                                         reuse=tf.compat.v1.AUTO_REUSE):
            if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
                self.pd_train = []
                for i in range(Config.POLICY_NHEADS):
                    with tf.compat.v1.variable_scope(
                            "head_" + str(i), reuse=tf.compat.v1.AUTO_REUSE):
                        self.pd_train.append(
                            self.pdtype.pdfromlatent(self.h,
                                                     init_scale=0.01)[0])
                with tf.compat.v1.variable_scope(
                        "head_i", reuse=tf.compat.v1.AUTO_REUSE):
                    self.pd_train_i = self.pdtype.pdfromlatent(
                        self.h, init_scale=0.01)[0]
            else:
                with tf.compat.v1.variable_scope(
                        "head_0", reuse=tf.compat.v1.AUTO_REUSE):
                    self.pd_train = [
                        self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]
                    ]

            # Both the multi-head and single-head configurations currently
            # share a single value head; the per-head variant is kept for
            # reference:
            # self.vf_train = [fc(self.h, 'v'+str(i), 1)[:, 0] for i in range(Config.POLICY_NHEADS)]
            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]
            self.vf_i_train = fc(tf.stop_gradient(self.h), 'v_i', 1)[:, 0]
            self.vf_i_run = self.vf_i_train

            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train

            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []

        # Use the current head for classical PPO updates
        a0_run = [
            self.pd_run[head_idx].sample()
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        neglogp0_run = [
            self.pd_run[head_idx].neglogp(a0_run[head_idx])
            for head_idx in range(Config.POLICY_NHEADS)
        ]
        self.initial_state = None

        def step(ob,
                 update_frac,
                 skill_idx=None,
                 one_hot_skill=None,
                 nce_dict={},
                 *_args,
                 **_kwargs):
            if Config.REPLAY:
                ob = ob.astype(np.float32)
            a, v, v_i, neglogp = sess.run(
                [a0_run[0], self.vf_run[0], self.vf_i_run, neglogp0_run[0]], {
                    REP_PROC: ob,
                    Z: one_hot_skill
                })
            return a, v, v_i, self.initial_state, neglogp

        def rep_vec(ob, *_args, **_kwargs):
            return sess.run(self.h, {X: ob})

        def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})

        def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
            return sess.run(self.vf_i_run, {REP_PROC: ob, Z: one_hot_skill})

        def nce_fw_pass(nce_dict):
            return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

        def custom_train(ob, rep_vecs):
            return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

        def compute_codes(ob, act):
            return sess.run([
                tf.reshape(self.codes,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                tf.reshape(self.z_t_1,
                           (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                self.h_codes[:, 1:]
            ], {
                REP_PROC: ob,
                self.A_cluster: act
            })

        def compute_hard_codes(ob):
            return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

        def compute_cluster_returns(returns):
            return sess.run([self.cluster_returns], {self.R_cluster: returns})

        self.X = X
        self.processed_x = processed_x
        self.step = step
        self.value = value
        self.value_i = value_i
        self.rep_vec = rep_vec
        self.custom_train = custom_train
        self.nce_fw_pass = nce_fw_pass
        self.encoder = choose_cnn
        self.REP_PROC = REP_PROC
        self.Z = Z
        self.compute_codes = compute_codes
        self.compute_hard_codes = compute_hard_codes
        self.compute_cluster_returns = compute_cluster_returns
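Finally, the sinkhorn helper that turns prototype scores into the soft cluster assignments (codes) is not included here. A sketch of the usual SwAV-style Sinkhorn-Knopp normalization, with typical values for the temperature and iteration count (both are assumptions, not taken from this code):

import tensorflow as tf

def sinkhorn(scores, eps=0.05, n_iters=3):
    # scores: [batch, n_clusters]. Returns soft assignments whose cluster
    # marginals are (approximately) uniform; each row sums to 1.
    Q = tf.transpose(tf.exp(scores / eps))   # [n_clusters, batch]
    Q = Q / tf.reduce_sum(Q)
    K = tf.cast(tf.shape(Q)[0], tf.float32)  # number of clusters
    B = tf.cast(tf.shape(Q)[1], tf.float32)  # batch size
    for _ in range(n_iters):
        Q = Q / (K * tf.reduce_sum(Q, axis=1, keepdims=True))
        Q = Q / (B * tf.reduce_sum(Q, axis=0, keepdims=True))
    Q = Q * B
    return tf.transpose(Q)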