def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        h = ppo_cnn_model(processed_x)
        v = tf.layers.dense(h, 1, name='v')
        vf = tf.squeeze(v, axis=[1])
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False):
    """
    Policy object for A2C

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_batch: (int) The batch size (n_envs * n_steps)
    :param n_steps: (int) The number of steps to run for each environment
    :param n_lstm: (int) The number of LSTM cells (for recurrent policies)
    :param reuse: (bool) If the policy is reusable or not
    """
    self.n_env = n_batch // n_steps
    self.obs_ph, self.processed_x = observation_input(ob_space, n_batch)
    self.masks_ph = tf.placeholder(tf.float32, [n_batch])  # mask (done t-1)
    self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2])  # states
    self.pdtype = make_proba_dist_type(ac_space)
    self.sess = sess
    self.reuse = reuse
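# A minimal sketch of the arrays a recurrent subclass would feed into the placeholders
# above (illustrative only, not part of the original class). The zero initial state of
# shape (n_env, 2 * n_lstm) mirrors the `initial_state = np.zeros((nenv, nlstm * 2))`
# used by the LSTM policies later in this file; masks_ph carries the done flags from the
# previous step so the LSTM state can be reset at episode boundaries.
import numpy as np

_n_env, _n_steps, _n_lstm = 4, 5, 256
_initial_state = np.zeros((_n_env, 2 * _n_lstm), dtype=np.float32)  # LSTM cell + hidden state per env
_dones = np.zeros((_n_env * _n_steps,), dtype=np.float32)           # 1.0 where the episode ended at t-1
# These would be fed as {states_ph: _initial_state, masks_ph: _dones} alongside the observations.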
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = tf.tanh
        flatten = tf.layers.flatten
        pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
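# A minimal usage sketch for the feed-forward CNN policy above (not from the original
# source). The class name `CnnPolicy` and the vectorized-env variable `venv` are
# assumptions for illustration; since the policy carries no recurrent state,
# `initial_state` is simply None and can be ignored in the rollout loop.
#
# sess = tf.Session()
# policy = CnnPolicy(sess, venv.observation_space, venv.action_space,
#                    nbatch=venv.num_envs, nsteps=1)
# sess.run(tf.global_variables_initializer())
# obs = venv.reset()
# actions, values, _state, neglogpacs = policy.step(obs)
# obs, rewards, dones, infos = venv.step(actions)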
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, name='policy', **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope(name, reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def step_test(ob, *_args, **_kwargs):
        # greedy action: argmax over the policy logits instead of sampling
        a, v, neglogp = sess.run([tf.argmax(self.pd.logits, axis=-1), vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def step_policyflat(ob, *_args, **_kwargs):
        a, v, neglogp, policyflat = sess.run([a0, vf, neglogp0, self.pd.logits], {X: ob})
        # a, v, self.initial_state, neglogp = self.step(ob, *_args, **_kwargs)
        # pa = np.exp(-neglogp)
        return a, v, self.initial_state, neglogp, policyflat

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.step_test = step_test
    self.step_policyflat = step_policyflat
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = tf.tanh
        processed_x = tf.layers.flatten(processed_x)
        pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))  # policy
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))  # value function
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        # pdtype is the parameterized family of probability distributions matching the
        # action space. For example, if the action space is Discrete(4), the type is
        # CategoricalPdType(); combining it with the network output (pi) gives the action
        # distribution CategoricalPd. Sampling from that distribution yields the action a0,
        # and neglogp0 is the negative log-probability (self-information) of that action.
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        h, self.dropout_assign_ops = choose_cnn(processed_x)
        vf = fc(h, 'v', 1)[:, 0]
        lp = fc(h, 'lp', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, l, neglogp = sess.run([a0, vf, lp, neglogp0], {X: ob})
        return a, v, l, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.lp = lp
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, arch='impala', use_batch_norm=True, dropout=0, **conv_kwargs): self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) scaled_images = tf.cast(processed_x, tf.float32) / 255. with tf.variable_scope("model", reuse=tf.AUTO_REUSE): h, self.dropout_assign_ops = random_impala_cnn( scaled_images, use_batch_norm=use_batch_norm, dropout=dropout) vf = fc(h, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): clean_h, _ = impala_cnn(scaled_images, use_batch_norm=use_batch_norm, dropout=dropout) clean_vf = fc(clean_h, 'v', 1)[:, 0] self.clean_pd, self.clean_pi = self.pdtype.pdfromlatent( clean_h, init_scale=0.01) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) clean_a0 = self.clean_pd.sample() clean_neglogp0 = self.clean_pd.neglogp(clean_a0) self.initial_state = None def step(ob, clean_flag, *_args, **_kwargs): a, v, neglogp, c_a, c_v, c_neglogp \ = sess.run([a0, vf, neglogp0, clean_a0, clean_vf, clean_neglogp0], {X:ob}) if clean_flag: return c_a, c_v, self.initial_state, c_neglogp else: return a, v, self.initial_state, neglogp def value(ob, clean_flag, *_args, **_kwargs): v, c_v = sess.run([vf, clean_vf], {X: ob}) if clean_flag: return c_v else: return v self.X = X self.H = h self.CH = clean_h self.vf = vf self.clean_vf = clean_vf self.step = step self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = tf.tanh
        processed_x = tf.layers.flatten(processed_x)
        pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        vf = fc(h5, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(h5)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
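# A minimal rollout sketch for the recurrent policy above (illustrative only: the class
# name `CnnLstmPolicy` and the vectorized env `venv` are assumptions, not from the
# original). The point is that `state` and the previous-step done flags must be threaded
# through every `step` call so the LSTM state is carried over and reset on episode ends.
#
# policy = CnnLstmPolicy(sess, venv.observation_space, venv.action_space,
#                        nbatch=venv.num_envs, nsteps=1, nlstm=256)
# state = policy.initial_state                       # zeros, shape (nenv, 2 * nlstm)
# dones = np.zeros(venv.num_envs, dtype=np.float32)  # mask: 1.0 resets the LSTM state
# obs = venv.reset()
# for _ in range(128):
#     actions, values, state, neglogpacs = policy.step(obs, state, dones)
#     obs, rewards, dones, infos = venv.step(actions)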
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    # X, processed_x = observation_input(ob_space, nbatch)
    X, processed_x = observation_input(ob_space, None)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None
    self.entropy = cat_entropy(self.pi)

    def step(ob, *_args, **_kwargs):
        a, neglogp = sess.run([a0, neglogp0], {X: ob})
        return a, self.initial_state, neglogp

    # def value(ob, *_args, **_kwargs):
    #     return sess.run(vf, {X: ob})

    def neg_log_prob(actions):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.pi, labels=actions)

    self.X = X
    self.step = step
    # self.value = value
    self.neg_log_prob = neg_log_prob
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        vf = fc(h5, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(h5)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) scaled_images = tf.cast(processed_x, tf.float32) / 255. mc_index = tf.placeholder(tf.int64, shape=[1], name='mc_index') with tf.variable_scope("model", reuse=tf.AUTO_REUSE): h, self.dropout_assign_ops = random_impala_cnn(scaled_images) vf = fc(h, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): clean_h, _ = impala_cnn(scaled_images) clean_vf = fc(clean_h, 'v', 1)[:, 0] self.clean_pd, self.clean_pi = self.pdtype.pdfromlatent( clean_h, init_scale=0.01) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) clean_a0 = self.clean_pd.sample() clean_neglogp0 = self.clean_pd.neglogp(clean_a0) self.initial_state = None def step(ob, *_args, **_kwargs): a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): return sess.run(vf, {X: ob}) def step_with_clean(flag, ob, *_args, **_kwargs): a, v, neglogp, c_a, c_v, c_neglogp \ = sess.run([a0, vf, neglogp0, clean_a0, clean_vf, clean_neglogp0], {X:ob}) if flag: return c_a, c_v, self.initial_state, c_neglogp else: return a, v, self.initial_state, neglogp def value_with_clean(flag, ob, *_args, **_kwargs): v, c_v = sess.run([vf, clean_vf], {X: ob}) if flag: return c_v else: return v self.X = X self.H = h self.CH = clean_h self.vf = vf self.clean_vf = clean_vf self.step = step self.value = value self.step_with_clean = step_with_clean self.value_with_clean = value_with_clean
def __init__(self, observation_space, name=None):
    """
    Creates an input placeholder tailored to a specific observation space

    :param observation_space: (Gym Space) observation space of the environment.
        Should be one of the gym.spaces types
    :param name: (str) tensorflow name of the underlying placeholder
    """
    inpt, self.processed_inpt = observation_input(observation_space, name=name)
    super().__init__(inpt)
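# A short usage sketch (hedged, not from the original): assuming the surrounding class
# follows the baselines `PlaceholderTfInput` interface (a `get()` accessor and
# `make_feed_dict(data)`), the wrapper exposes both the raw placeholder and the processed
# tensor for building a network head. The class name `ObservationInput`, `env`, and
# `obs_batch` below are assumptions for illustration.
#
# obs_input = ObservationInput(env.observation_space, name='observation')
# q_values = tf.layers.dense(tf.layers.flatten(obs_input.processed_inpt),
#                            units=env.action_space.n)
# feed = obs_input.make_feed_dict(obs_batch)  # maps the underlying placeholder to obs_batch
# sess.run(q_values, feed_dict=feed)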
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) img_X, processed_img_x = observation_input(ob_space[0], nbatch) vec_X, processed_vec_x = observation_input(ob_space[1], nbatch) with tf.variable_scope("model", reuse=reuse): # img feature extractor img_h = vgg19_cnn(processed_img_x, **conv_kwargs) # vec feature extractor activ = tf.nn.relu vec_h1 = activ(fc(processed_vec_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) vec_h = activ(fc(vec_h1, 'pi_fc2', nh=128, init_scale=np.sqrt(2))) # feature concat h = tf.concat([img_h,vec_h],1) vf = fc(h, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): img_x_input = ob[0] vec_x_input = ob[1] a, v, neglogp = sess.run([a0, vf, neglogp0], {img_X:img_x_input,vec_X:vec_x_input}) return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): img_x_input = ob[0] vec_x_input = ob[1] return sess.run(vf, {img_X:img_x_input,vec_X:vec_x_input}) self.img_X = img_X self.vec_X = vec_X self.vf = vf self.step = step self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, create_additional=True, nlstm=256): nenv = nbatch // nsteps self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) S = tf.placeholder(tf.float32, [nenv, nlstm * 2]) #states with tf.variable_scope("model", reuse=tf.AUTO_REUSE): h, self.dropout_assign_ops = choose_cnn(processed_x) xs = batch_to_seq(h, nenv, nsteps) ms = batch_to_seq(M, nenv, nsteps) h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) h5 = seq_to_batch(h5) if (create_additional): vf = fc(h5, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h5) a0 = self.pd.sample() if (create_additional): neglogp0 = self.pd.neglogp(a0) self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32) def step(ob, state, mask): if (create_additional): a, v, s, neglogp = sess.run([a0, vf, snew, neglogp0], { X: ob, S: state, M: mask }) else: a, s = sess.run([a0, snew], {X: ob, S: state, M: mask}) v = np.zeros_like(a) neglogp = np.zeros_like(a) return a, v, s, neglogp def value(ob, state, mask): return sess.run(vf, {X: ob, S: state, M: mask}) self.X = X self.M = M self.S = S if (create_additional): self.vf = vf self.value = value self.step = step
def __init__(self, observation_space, name=None, extra_channels=0):
    """Creates an input placeholder tailored to a specific observation space

    Parameters
    ----------
    observation_space:
        observation space of the environment. Should be one of the gym.spaces types
    name: str
        tensorflow name of the underlying placeholder
    extra_channels: int
        number of extra channels appended to the observation placeholder
        (passed through to observation_input)
    """
    inpt, self.processed_inpt = observation_input(observation_space, name=name,
                                                  extra_channels=extra_channels)
    super().__init__(inpt)
def __init__(self, observation_space, name=None):
    """Creates an input placeholder tailored to a specific observation space

    Parameters
    ----------
    observation_space:
        observation space of the environment. Should be one of the gym.spaces types
    name: str
        tensorflow name of the underlying placeholder
    """
    inpt, self.processed_inpt = observation_input(observation_space, name=name)
    super().__init__(inpt)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=tf.AUTO_REUSE, policy_scope='', value_scope=''): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) with tf.variable_scope("model" + policy_scope, reuse=tf.AUTO_REUSE): X, processed_x = observation_input(ob_space, nbatch) print(X) activ = tf.tanh processed_x = tf.layers.flatten(processed_x) pi_h1 = activ( fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) self.pd, self.pi = self.pdtype.pdfromlatent( pi_h2, init_scale=0.01) # pd->probability distribution; pi->policy with tf.variable_scope("model" + value_scope, reuse=tf.AUTO_REUSE): vf_h1 = activ( fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) vf = fc(vf_h2, 'vf', 1)[:, 0] a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): return sess.run(vf, {X: ob}) def value_pi(ob, *_args, **_kwargs): pass def neg_log_prob(actions): return self.pd.neglogp(actions) self.X = X self.vf = vf self.step = step self.value = value self.neg_log_prob = neg_log_prob self.entropy = self.pd.entropy()
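# A minimal sketch of why the policy and value networks above live under separate
# variable scopes: their variables can be fetched and optimized independently. This
# assumes non-empty `policy_scope` / `value_scope` suffixes so the two collections are
# disjoint; `pi_loss`, `vf_loss`, and the learning rates are illustrative placeholders.
#
# pi_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="model" + policy_scope)
# vf_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="model" + value_scope)
# pi_train_op = tf.train.AdamOptimizer(3e-4).minimize(pi_loss, var_list=pi_vars)
# vf_train_op = tf.train.AdamOptimizer(1e-3).minimize(vf_loss, var_list=vf_vars)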
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, create_additional=True, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): h, self.dropout_assign_ops = choose_cnn(processed_x) if create_additional: vf = fc(h, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) a0 = self.pd.sample() if (create_additional): neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): if create_additional: a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) else: a = sess.run(a0, {X: ob}) v = np.zeros_like(a) neglogp = np.zeros_like(a) return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): return sess.run(vf, {X: ob}) self.X = X if create_additional: self.vf = vf self.value = value self.step = step
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, arch='impala', use_batch_norm=True, dropout=0, **conv_kwargs): self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): processed_x3 = processed_x h, self.dropout_assign_ops = choose_cnn( processed_x3, arch=arch, use_batch_norm=use_batch_norm, dropout=dropout) vf = fc(h, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, clean_flag, *_args, **_kwargs): a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) return a, v, self.initial_state, neglogp def value(ob, clean_flag, *_args, **_kwargs): return sess.run(vf, {X: ob}) self.X = X self.vf = vf self.step = step self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): h, self.dropout_assign_ops = choose_cnn(processed_x) with tf.variable_scope("policy", reuse=tf.AUTO_REUSE): vf = fc(h, 'v', 1)[:, 0] self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) # Dipam: Add discrimiator network on h with tf.variable_scope("discriminator", reuse=tf.AUTO_REUSE): discfc1 = tf.nn.tanh(fc(h, 'discL1', 100)) disc_logits = fc(discfc1, 'disc', 2) #probd = self.pd.flatparam() #greedyaction = self.pd.mode() a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): #a, v, neglogp, pdout, a_greedy = sess.run([a0, vf, neglogp0, probd, greedyaction], {X:ob}) a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) #return a, v, self.initial_state, neglogp, pdout, a_greedy return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): return sess.run(vf, {X: ob}) self.X = X self.vf = vf self.step = step self.value = value self.disc_logits = disc_logits
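# One plausible training target for the 2-way discriminator head above (a hedged sketch,
# not from the original source): cross-entropy against integer domain labels fed through
# a hypothetical `disc_labels` placeholder, optimized only over the "discriminator" scope
# so the policy and value heads are untouched by this update.
#
# disc_labels = tf.placeholder(tf.int32, [None], name='disc_labels')
# disc_loss = tf.reduce_mean(
#     tf.nn.sparse_softmax_cross_entropy_with_logits(labels=disc_labels, logits=disc_logits))
# disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator')
# disc_train_op = tf.train.AdamOptimizer(1e-4).minimize(disc_loss, var_list=disc_vars)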
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    print(processed_x)
    with tf.variable_scope("model", reuse=reuse):
        conv = Conv2D(64, (3, 3), padding='same')(processed_x)
        conv = Conv2D(32, (3, 3), padding='same')(conv)
        flat = Flatten()(conv)
        dense = Dense(100, activation='relu')(flat)
        vf = Dense(1)(dense)
        self.pd, self.pi = self.pdtype.pdfromlatent(dense)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        # v = np.array(v)
        return a, v[:, 0], self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})[:, 0]

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # signature assumed to match the sibling policies in this file
    # renormalization for continual or (especially!) transfer learning.
    # Attention:
    #   Attentive RNN image recognition less susceptible to adversarial perturbations?
    #   Can working memory shape modular structure of network?
    # RRNN (Recursive Recurrent NNs):
    #   like capsule networks but more flexible
    #   reusable capsules; dynamically determined depth
    #   however, I like the idea of activation as feature existence probability
    #   Q: could these be good for transfer learning? Maybe.
    # Tensor valued working memories (generalizing NTM)
    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope('model', reuse=reuse):
        h = caps_cnn(processed_x)
        h = capsule_conv(h, 'capsconv', 4, 2, 32, 8)
        h = capsule(h, 'caps', 16, 8, from_conv=True)
        vf = fc(h, 'v', 1)[:, 0]  # value function
        # for discrete action spaces, create a final capsule layer with
        # one capsule for each possible action
        if isinstance(ac_space, spaces.Discrete):
            p = capsule(h, 'pcaps', ac_space.n, 4, from_conv=False)
            pnorm = tf.reduce_sum(tf.square(p), axis=2)
            self.pd, self.pi = self.pdtype.pdfromflat(pnorm), pnorm
        else:
            self.pd, self.pi = self.pdtype.pdfromlatent(h)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) # explicitly create vector space for latent vectors latent_space = Box(-np.inf, np.inf, shape=(256, )) # So that I can compute the saliency map if Config.REPLAY: X = tf.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='Ob') processed_x = X # create placeholders for custom loss ANCHORS = tf.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='anch') POST_TRAJ = tf.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='post_traj') NEG_TRAJ = tf.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='neg_traj') else: X, processed_x = observation_input(ob_space, nbatch) ANCHORS, PROC_ANCH = observation_input(ob_space, Config.REP_LOSS_M * Config.NUM_ENVS, name='anch') POST_TRAJ, PROC_POS = observation_input(ob_space, Config.REP_LOSS_M * Config.NUM_ENVS, name='pos_traj') NEG_TRAJ, PROC_NEG = observation_input( ob_space, Config.REP_LOSS_M * Config.NUM_ENVS * Config.NEGS, name='neg_traj') print('bob', ob_space) print(type(ob_space)) AVG_REPS, AVG_REP_PROC = observation_input(latent_space, nbatch) # observation input with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE): act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn( processed_x) self.train_dropout_assign_ops = fast_dropout_assigned_ops self.run_dropout_assign_ops = slow_dropout_assign_ops # stack together action invariant & conditioned layers for full representation layer self.h = tf.concat([act_condit, act_invariant], axis=1) # concat average phi vector self.h_avg = tf.concat([self.h, AVG_REP_PROC], axis=1) self.h_vf = self.h_avg # NOTE: (Ahmed) I commented out all the IBAC-SNI settings to make this easier to read # since we shouldn't be using any of these settings anyway. # Noisy policy and value function for train # if Config.BETA >= 0: # pdparam = _matching_fc(self.h, 'pi', ac_space.n, init_scale=1.0, init_bias=0) # pdparam = tf.reshape(pdparam, shape=(Config.NR_SAMPLES, -1, ac_space.n)) # pdparam = tf.transpose(pdparam, perm=[1,0,2]) # dists = ds.Categorical(logits=pdparam) # self.pd_train = ds.MixtureSameFamily( # mixture_distribution=ds.Categorical(probs=[1./Config.NR_SAMPLES]*Config.NR_SAMPLES), # components_distribution=dists) # self.pd_train.neglogp = lambda a: - self.pd_train.log_prob(a) # self.vf_train = tf.reduce_mean(tf.reshape(fc(self.h, 'v', 1), shape=(Config.NR_SAMPLES, -1, 1)), 0)[:, 0] # else: self.pd_train, _ = self.pdtype.pdfromlatent(self.h_avg, init_scale=0.01) self.vf_train = fc(self.h, 'v', 1)[:, 0] # if Config.SNI: # assert Config.DROPOUT == 0 # assert not Config.OPENAI # # Used with VIB: Noiseless pd_run and _both_ value functions # print("Activating SNI (includes VF)") # # Use deterministic value function for both as VIB for regression seems like a bad idea # self.vf_run = self.vf_train = fc(self.h_vf, 'v', 1)[:, 0] # # Have a deterministic run policy based on the mean # self.pd_run, _ = self.pdtype.pdfromlatent(self.h_vf, init_scale=0.01) # elif Config.SNI2: # assert not Config.OPENAI # # Used with Dropout instead of OPENAI modifier # # 'RUN' versions are updated slowly, train versions updated faster, gradients are mixed # print("Activating SNI2") # # Deterministic bootstrap value... 
doesn't really matter but this is more consistent # self.vf_run = fc(h_vf, 'v', 1)[:, 0] # # Run policy based on slow changing latent # self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01) # # Train is updated for each gradient update, slow is only updated once per batch # elif Config.OPENAI: # # Completely overwrite train versions as everything changes slowly # # Train version is same as run version, both of which are slow # self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01) # self.pd_train = self.pd_run # self.vf_run = self.vf_train = fc(h_vf, 'v', 1)[:, 0] # # Stochastic version is never used, so can set to ignore # self.train_dropout_assign_ops = [] # else: # Plain Dropout version: Only fast updates / stochastic latent for VIB self.pd_run = self.pd_train self.vf_run = self.vf_train # For Dropout: Always change layer, so slow layer is never used self.run_dropout_assign_ops = [] # Old aidl version # with tf.variable_scope("model", reuse=True) as scope: # y = tf.constant([1.0, 0.0]) # _, anchor_rep, _, _ = choose_cnn(PROC_ANCH) # _, pos_rep, _, _ = choose_cnn(PROC_POS) # _, neg_rep, _, _ = choose_cnn(PROC_NEG) # # (num_envs, m, nodes) # anchor_rep = tf.reshape(anchor_rep, [Config.NUM_ENVS, Config.REP_LOSS_M, -1]) # pos_rep = tf.reshape(pos_rep, [Config.NUM_ENVS, Config.REP_LOSS_M, -1]) # # (neg_samples, num_envs, m, nodes) # neg_rep = tf.reshape(neg_rep, [Config.NEGS, Config.NUM_ENVS, Config.REP_LOSS_M, -1]) # # (num_envs, m) multiply all representation layers across envs, and trajectories # pos_matr = tf.einsum('aij,aij->ai', anchor_rep, pos_rep) # # logit for positive sample and anchor # pos_logit = tf.expand_dims(tf.reduce_mean(pos_matr), axis=0) # # (neg_samples, num_envs, m) multiply all representation layers across envs, and trajectories # # for each negative sample # neg_matr = tf.einsum('aij,kaij->kai', anchor_rep, neg_rep) # # get average over negative samples to find logits # neg_logits = tf.math.reduce_mean(neg_matr, axis=(1, 2)) # # TODO put back in tanh clamping in case things get unstable with InfoNCE # logits = tf.concat([pos_logit, neg_logits], axis=0) # # bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) # # loss is negative of first logit, which is positive samp/ anchor # neg_probs = tf.math.negative(tf.nn.log_softmax(logits)) # self.rep_loss = neg_probs[0]*Config.REP_LOSS_WEIGHT*-1 # with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE): # params = tf.trainable_variables() # # Apply custom loss # trainer = None # if Config.SYNC_FROM_ROOT: # trainer = MpiAdamOptimizer(MPI.COMM_WORLD, epsilon=1e-5) # else: # trainer = tf.train.AdamOptimizer( epsilon=1e-5) # rep_params = params[:-6] # grads_and_var = trainer.compute_gradients(self.rep_loss, rep_params) # grads, var = zip(*grads_and_var) # if max_grad_norm is not None: # grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) # grads_and_var = list(zip(grads, var)) # _custtrain = trainer.apply_gradients(grads_and_var) # Used in step a0_run = self.pd_run.sample() neglogp0_run = self.pd_run.neglogp(a0_run) self.initial_state = None def step(ob, phi_bar, update_frac, *_args, **_kwargs): if Config.REPLAY: ob = ob.astype(np.float32) a, v, neglogp = sess.run([a0_run, self.vf_run, neglogp0_run], { X: ob, AVG_REP_PROC: phi_bar }) return a, v, self.initial_state, neglogp def rep_vec(ob, *_args, **_kwargs): return sess.run(self.h, {X: ob}) def value(ob, update_frac, *_args, **_kwargs): return sess.run(self.vf_run, {X: ob}) def custom_train(anchors, pos_traj, neg_traj): 
return sess.run([self.rep_loss, _custtrain], { ANCHORS: anchors, POST_TRAJ: pos_traj, NEG_TRAJ: neg_traj })[:-1] self.X = X self.ANCHORS = ANCHORS self.POST_TRAJ = POST_TRAJ self.NEG_TRAJ = NEG_TRAJ self.processed_x = processed_x self.step = step self.value = value self.custom_train = custom_train self.rep_vec = rep_vec self.AVG_REP_PROC = AVG_REP_PROC
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) self.rep_loss = None # explicitly create vector space for latent vectors latent_space = Box(-np.inf, np.inf, shape=(256, )) # So that I can compute the saliency map if Config.REPLAY: X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='Ob') processed_x = X else: X, processed_x = observation_input(ob_space, None) TRAIN_NUM_STEPS = Config.NUM_STEPS // 16 REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 64, 64, 3), name='Rep_Proc') Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx') Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, Config.N_SKILLS), name='Curr_skill') CLUSTER_DIMS = 128 HIDDEN_DIMS_SSL = 256 self.protos = tf.compat.v1.Variable( initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)), trainable=True, name='Prototypes') self.A = self.pdtype.sample_placeholder([None], name='A') # trajectories of length m, for N policy heads. self.STATE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) self.STATE_NCE = tf.compat.v1.placeholder( tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3]) self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) # labels of Q value quantile bins self.LAB_NCE = tf.compat.v1.placeholder( tf.float32, [Config.POLICY_NHEADS, None]) self.A_i = self.pdtype.sample_placeholder( [None, Config.REP_LOSS_M, 1], name='A_i') self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None], name='R_cluster') self.A_cluster = self.pdtype.sample_placeholder([None], name='A_cluster') X = REP_PROC #tf.reshape(REP_PROC, [-1, 64, 64, 3]) with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("value", reuse=tf.compat.v1.AUTO_REUSE): act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn( X) with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("value", reuse=tf.compat.v1.AUTO_REUSE): self.h_v = tf.concat([act_condit, act_invariant], axis=1) with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("policy", reuse=tf.compat.v1.AUTO_REUSE): act_condit_pi, act_invariant_pi, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn( X) self.train_dropout_assign_ops = fast_dropout_assigned_ops self.run_dropout_assign_ops = slow_dropout_assign_ops with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("policy", reuse=tf.compat.v1.AUTO_REUSE): self.h_pi = tf.concat([act_condit_pi, act_invariant_pi], axis=1) act_one_hot = tf.reshape(tf.one_hot(self.A, ac_space.n), (-1, ac_space.n)) self.adv_pi = get_linear_layer(n_in=256 + 15, n_out=1)( tf.concat([self.h_pi, act_one_hot], axis=1)) self.v_pi = get_linear_layer(n_in=256, n_out=1)(self.h_pi) """ Clustering part """ with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("value", reuse=tf.compat.v1.AUTO_REUSE): # h_codes: n_batch x n_t x n_rkhs act_condit, act_invariant, _, _ = choose_cnn(X) self.h_codes = tf.transpose( tf.reshape(tf.concat([act_condit, act_invariant], axis=1), [-1, Config.NUM_ENVS, 256]), (1, 0, 2)) act_one_hot = tf.transpose( tf.reshape(tf.one_hot(self.A_cluster, ac_space.n), [-1, Config.NUM_ENVS, ac_space.n]), (1, 0, 2)) h_acc = [] for k in 
range(Config.CLUSTER_T): h_t = self.h_codes[:, k:tf.shape(self.h_codes)[1] - (Config.CLUSTER_T - k - 1)] a_t = act_one_hot[:, k:tf.shape(act_one_hot)[1] - (Config.CLUSTER_T - k - 1)] h_t = tf.reshape( FiLM(widths=[128], name='FiLM_layer')([ tf.expand_dims( tf.expand_dims(tf.reshape(h_t, (-1, 256)), 1), 1), tf.reshape(a_t, (-1, 15)) ])[:, 0, 0], (Config.NUM_ENVS, -1, 256)) h_acc.append(h_t) h_seq = tf.reshape(tf.concat(h_acc, 2), (-1, 256 * Config.CLUSTER_T)) self.z_t = get_online_predictor(n_in=256 * Config.CLUSTER_T, n_out=CLUSTER_DIMS, prefix='SH_z_pred')(h_seq) self.u_t = get_predictor(n_in=CLUSTER_DIMS, n_out=CLUSTER_DIMS, prefix='SH_u_pred')(self.z_t) self.z_t_1 = self.z_t # scores: n_batch x n_clusters scores = tf.linalg.matmul( tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0], tf.linalg.normalize(self.protos, axis=1, ord='euclidean')[0]) self.codes = sinkhorn(scores=scores) if Config.MYOW: """ Compute average cluster reward 1/N_i \sum_{C_i} V^pi(s_j) TODO: mine nearby representations of [st,stp1] with [st,at,stp1]? these two should be close if transitions are deterministic """ cluster_idx = tf.argmax(scores, 1) if False: reward_scale = [] for i in range(Config.N_SKILLS): filter_ = tf.cast(tf.fill(tf.shape(self.R_cluster), i), tf.float32) mask = tf.cast(tf.math.equal(filter_, self.codes), tf.float32) rets_cluster = tf.reduce_mean(mask * self.R_cluster) reward_scale.append(rets_cluster) self.cluster_returns = tf.stack(reward_scale) # Predict the average cluster value from the prototype (centroid) with tf.compat.v1.variable_scope( "online", reuse=tf.compat.v1.AUTO_REUSE): self.cluster_value_mse_loss = tf.reduce_mean( (get_predictor(n_in=CLUSTER_DIMS, n_out=1)( tf.transpose(self.protos)) - self.cluster_returns)**2) else: self.cluster_value_mse_loss = 0. 
""" MYOW where k-NN neighbors are replaced by Sinkhorn clusters """ with tf.compat.v1.variable_scope("random", reuse=tf.compat.v1.AUTO_REUSE): # h_codes: n_batch x n_t x n_rkhs act_condit_target, act_invariant_target, _, _ = choose_cnn(X) h_codes_target = tf.transpose( tf.reshape( tf.concat([act_condit_target, act_invariant_target], axis=1), [-1, Config.NUM_ENVS, 256]), (1, 0, 2)) h_t_target = h_codes_target[:, :-1] h_tp1_target = h_codes_target[:, 1:] # h_a_t = tf.transpose(tf.reshape(get_predictor(n_in=ac_space.n,n_out=256,prefix="SH_a_emb")( act_one_hot), (-1,Config.NUM_ENVS,256)), (1,0,2)) h_seq_target = tf.reshape( tf.concat([h_t_target, h_tp1_target], 2), (-1, 256 * Config.CLUSTER_T)) # act_one_hot_target = tf.reshape(tf.one_hot(self.A_cluster,ac_space.n), (-1,ac_space.n)) # h_seq_target = tf.squeeze(tf.squeeze(FiLM(widths=[512,512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq_target,1),1), act_one_hot_target]),1),1) y_online = h_seq y_target = tf.stop_gradient(h_seq_target) # y_reward = tf.reshape(self.R_cluster,(-1,1)) # get K closest vectors by Sinkhorn scores # dist = _compute_distance(y_reward,y_reward) dist = _compute_distance(y_online, y_target) k_t = 3 vals, indx = tf.nn.top_k(-dist, k_t + 1, sorted=True) # N_target = y_target with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): v_online_net = get_predictor(n_in=256 * Config.CLUSTER_T, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_v_pred') r_online_net = get_predictor(n_in=HIDDEN_DIMS_SSL, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_r_pred') v_online = v_online_net(y_online) r_online = r_online_net(v_online) with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE): v_target_net = get_predictor(n_in=256 * Config.CLUSTER_T, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_v_pred') r_target_net = get_predictor(n_in=HIDDEN_DIMS_SSL, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_r_pred') self.myow_loss = 0. 
for k in range(k_t): indx2 = indx[:, k] N_target = tf.gather(y_target, indx2) v_target = v_target_net(N_target) r_target = r_target_net(v_target) self.myow_loss += tf.reduce_mean(cos_loss( r_online, v_target)) #+ tf.reduce_mean(cos_loss(r_target, v_online)) # with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): # phi_s = get_online_predictor(n_in=256,n_out=CLUSTER_DIMS,prefix='SH_z_pred')(tf.reshape(h_acc[-1],(-1,256))) # self.myow_loss += tf.reduce_mean(cos_loss(phi_s, tf.transpose(tf.gather(self.protos,cluster_idx,axis=1),(1,0)) )) self.myow_loss += self.cluster_value_mse_loss with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("policy", reuse=tf.compat.v1.AUTO_REUSE): self.pd_train = [ self.pdtype.pdfromlatent(self.h_pi, init_scale=0.01)[0] ] with tf.compat.v1.variable_scope("value", reuse=tf.compat.v1.AUTO_REUSE): self.vf_train = [fc(self.h_v, 'v_0', 1)[:, 0]] # Plain Dropout version: Only fast updates / stochastic latent for VIB self.pd_run = self.pd_train self.vf_run = self.vf_train # For Dropout: Always change layer, so slow layer is never used self.run_dropout_assign_ops = [] # Use the current head for classical PPO updates a0_run = [self.pd_run[0].sample()] neglogp0_run = [self.pd_run[0].neglogp(a0_run[0])] self.initial_state = None def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict={}, *_args, **_kwargs): if Config.REPLAY: ob = ob.astype(np.float32) a, v, v_i, neglogp = sess.run( [a0_run[0], self.vf_run[0], self.vf_run[0], neglogp0_run[0]], { REP_PROC: ob, Z: one_hot_skill }) return a, v, v_i, self.initial_state, neglogp def rep_vec(ob, *_args, **_kwargs): return sess.run(self.h_pi, {X: ob}) def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs): return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill}) def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs): return sess.run(self.vf_run[0], {REP_PROC: ob, Z: one_hot_skill}) def nce_fw_pass(nce_dict): return sess.run([self.vf_i_run, self.rep_loss], nce_dict) def custom_train(ob, rep_vecs): return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0] def compute_codes(ob, act): return sess.run([ tf.reshape(self.codes, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), tf.reshape(self.z_t_1, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), self.h_codes[:, 1:] ], { REP_PROC: ob, self.A_cluster: act }) def compute_hard_codes(ob): return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob}) def compute_cluster_returns(returns): return sess.run([self.cluster_returns], {self.R_cluster: returns}) self.X = X self.processed_x = processed_x self.step = step self.value = value self.value_i = value_i self.rep_vec = rep_vec self.custom_train = custom_train self.nce_fw_pass = nce_fw_pass self.encoder = choose_cnn self.REP_PROC = REP_PROC self.Z = Z self.compute_codes = compute_codes self.compute_hard_codes = compute_hard_codes self.compute_cluster_returns = compute_cluster_returns
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) self.rep_loss = None # explicitly create vector space for latent vectors latent_space = Box(-np.inf, np.inf, shape=(256, )) # So that I can compute the saliency map if Config.REPLAY: X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='Ob') processed_x = X else: X, processed_x = observation_input(ob_space, None) TRAIN_NUM_STEPS = Config.NUM_STEPS // 16 REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 64, 64, 3), name='Rep_Proc') Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx') Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(nbatch, Config.N_SKILLS), name='Curr_skill') CODES = tf.compat.v1.placeholder(dtype=tf.float32, shape=(1024, Config.N_SKILLS), name='Train_Codes') CLUSTER_DIMS = 256 HIDDEN_DIMS_SSL = 256 STEP_BOOL = tf.placeholder(tf.bool, shape=[]) self.protos = tf.compat.v1.Variable( initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)), trainable=True, name='Prototypes') self.A = self.pdtype.sample_placeholder([None], name='A') self.R = tf.compat.v1.placeholder(tf.float32, [None], name='R') # trajectories of length m, for N policy heads. self.STATE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) self.STATE_NCE = tf.compat.v1.placeholder( tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3]) self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) # labels of Q value quantile bins self.LAB_NCE = tf.compat.v1.placeholder( tf.float32, [Config.POLICY_NHEADS, None]) self.A_i = self.pdtype.sample_placeholder( [None, Config.REP_LOSS_M, 1], name='A_i') self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None]) self.A_cluster = self.pdtype.sample_placeholder( [None, Config.NUM_ENVS], name='A_cluster') self.pse_obs_1 = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) self.pse_actions_1 = self.pdtype.sample_placeholder([None], name='A_1') self.pse_rewards_1 = tf.compat.v1.placeholder(tf.float32, [None], name='R_1') self.pse_obs_2 = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) self.pse_actions_2 = self.pdtype.sample_placeholder([None], name='A_2') self.pse_rewards_2 = tf.compat.v1.placeholder(tf.float32, [None], name='R_2') with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn( processed_x) self.train_dropout_assign_ops = fast_dropout_assigned_ops self.run_dropout_assign_ops = slow_dropout_assign_ops self.h = tf.concat([act_condit, act_invariant], axis=1) """ PSEs code """ contrastive_loss_temperature = Config.TEMP with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): n_pse = tf.shape(self.pse_obs_1)[0] concat_pse_obs = tf.concat([self.pse_obs_1, self.pse_obs_2], 0) act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn( concat_pse_obs) h_pse = tf.concat([act_condit, act_invariant], axis=1) representation_1, representation_2 = h_pse[:n_pse], h_pse[n_pse:] # PSE loss act1 = tf.one_hot(self.pse_actions_1, 15) act2 = tf.one_hot(self.pse_actions_2, 15) # act1 = tf.reshape(act1,(Config.NUM_ENVS,-1,15)) # act2 = tf.reshape(act2,(Config.NUM_ENVS,-1,15)) metric_vals = compute_psm_metric(act1, act2, Config.GAMMA) self.contrastive_loss = Config.REP_LOSS_WEIGHT * representation_alignment_loss( representation_1, representation_2, metric_vals, 
use_coupling_weights=True, temperature=contrastive_loss_temperature, return_representation=False) with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("head_0", reuse=tf.compat.v1.AUTO_REUSE): self.pd_train = [ self.pdtype.pdfromlatent(tf.stop_gradient(self.h), init_scale=0.01)[0] ] self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]] # Plain Dropout version: Only fast updates / stochastic latent for VIB self.pd_run = self.pd_train self.vf_run = self.vf_train # For Dropout: Always change layer, so slow layer is never used self.run_dropout_assign_ops = [] # Use the current head for classical PPO updates a0_run = [ self.pd_run[head_idx].sample() for head_idx in range(Config.POLICY_NHEADS) ] neglogp0_run = [ self.pd_run[head_idx].neglogp(a0_run[head_idx]) for head_idx in range(Config.POLICY_NHEADS) ] self.initial_state = None def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict={}, *_args, **_kwargs): if Config.REPLAY: ob = ob.astype(np.float32) head_idx = 0 a, v, neglogp = sess.run([ a0_run[head_idx], self.vf_run[head_idx], neglogp0_run[head_idx] ], {X: ob}) return a, v, self.initial_state, neglogp def rep_vec(ob, *_args, **_kwargs): return sess.run(self.h, {X: ob}) def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs): if Config.AGENT == 'ppo_diayn': return sess.run(self.vf_run, {X: ob, Z: one_hot_skill}) elif Config.AGENT == 'ppo_goal': return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill}) else: return sess.run(self.vf_run, {self.STATE: ob, X: ob}) def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs): if Config.AGENT == 'ppo_diayn': return sess.run(self.vf_i_run, {X: ob, Z: one_hot_skill}) elif Config.AGENT == 'ppo_goal': return sess.run(self.vf_i_run, { REP_PROC: ob, Z: one_hot_skill }) else: return sess.run(self.vf_i_run, {self.STATE: ob, X: ob}) def nce_fw_pass(nce_dict): return sess.run([self.vf_i_run, self.rep_loss], nce_dict) def custom_train(ob, rep_vecs): return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0] def compute_codes(ob, act): return sess.run([ tf.reshape(self.codes, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), tf.reshape(self.z_t_1, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), self.h_codes[:, 1:] ], { REP_PROC: ob, self.A_cluster: act }) def compute_hard_codes(ob): return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob}) def compute_cluster_returns(returns): return sess.run([self.cluster_returns], {self.R_cluster: returns}) self.X = X self.processed_x = processed_x self.step = step self.value = value self.value_i = value_i self.rep_vec = rep_vec self.custom_train = custom_train self.nce_fw_pass = nce_fw_pass self.encoder = choose_cnn self.REP_PROC = REP_PROC self.Z = Z self.compute_codes = compute_codes self.compute_hard_codes = compute_hard_codes self.compute_cluster_returns = compute_cluster_returns self.CODES = CODES self.STEP_BOOL = STEP_BOOL
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) X, processed_x = observation_input(ob_space, nbatch) with tf.variable_scope("model", reuse=reuse): conv1 = caps_cnn(processed_x, **conv_kwargs) conv1 = tf.transpose( conv1, [0, 3, 1, 2]) # reshape to expected input format conv1 = tf.expand_dims(conv1, 1) capsule1 = layers.conv_slim_capsule( conv1, input_dim=1, output_dim=32, layer_name='conv_capsule1', num_routing=1, input_atoms=256, output_atoms=8, stride=2, kernel_size=9, padding='VALID', leaky=False, ) capsule1_atom_last = tf.transpose(capsule1, [0, 1, 3, 4, 2]) capsule1_3d = tf.reshape(capsule1_atom_last, [tf.shape(conv1)[0], -1, 8]) _, _, _, height, width = capsule1.get_shape() input_dim1 = 32 * height.value * width.value # main encoding layer h = layers.capsule( input_tensor=capsule1_3d, input_dim=input_dim1, output_dim=8, layer_name='capsule2', input_atoms=8, output_atoms=16, num_routing=3, leaky=False, ) # capsule policy layer hpi = layers.capsule( input_tensor=h, input_dim=8, output_dim=4, layer_name='capsule_pi', input_atoms=16, output_atoms=4, num_routing=3, leaky=False, ) pnorm = tf.reduce_sum(tf.square(hpi), axis=-1) # value function hvf = conv_to_fc(h) vf = fc(hvf, 'v', 1)[:, 0] # policy based on pnorm (the squared norms of policy capsule vecs) self.pd, self.pi = self.pdtype.pdfromflat(pnorm), pnorm a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): return sess.run(vf, {X: ob}) self.X = X self.vf = vf self.step = step self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs): #pylint: disable=W0613 self.pdtype = make_pdtype(ac_space) self.rep_loss = None # explicitly create vector space for latent vectors latent_space = Box(-np.inf, np.inf, shape=(256, )) # So that I can compute the saliency map if Config.REPLAY: X = tf.compat.v1.placeholder(shape=(nbatch, ) + ob_space.shape, dtype=np.float32, name='Ob') processed_x = X else: X, processed_x = observation_input(ob_space, None) TRAIN_NUM_STEPS = Config.NUM_STEPS // 16 REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 64, 64, 3), name='Rep_Proc') Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx') Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(nbatch, Config.N_SKILLS), name='Curr_skill') CODES = tf.compat.v1.placeholder(dtype=tf.float32, shape=(1024, Config.N_SKILLS), name='Train_Codes') CLUSTER_DIMS = 256 HIDDEN_DIMS_SSL = 256 STEP_BOOL = tf.placeholder(tf.bool, shape=[]) self.protos = tf.compat.v1.Variable( initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)), trainable=True, name='Prototypes') self.A = self.pdtype.sample_placeholder([None], name='A') self.R = tf.compat.v1.placeholder(tf.float32, [None], name='R') # trajectories of length m, for N policy heads. self.STATE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) self.STATE_NCE = tf.compat.v1.placeholder( tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3]) self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3]) # labels of Q value quantile bins self.LAB_NCE = tf.compat.v1.placeholder( tf.float32, [Config.POLICY_NHEADS, None]) self.A_i = self.pdtype.sample_placeholder( [None, Config.REP_LOSS_M, 1], name='A_i') self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None]) self.A_cluster = self.pdtype.sample_placeholder( [None, Config.NUM_ENVS], name='A_cluster') with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn( processed_x) self.train_dropout_assign_ops = fast_dropout_assigned_ops self.run_dropout_assign_ops = slow_dropout_assign_ops self.h = tf.concat([act_condit, act_invariant], axis=1) """ Bisimulation code """ with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): # encoder loss act_one_hot_target = tf.reshape(tf.one_hot(self.A, ac_space.n), (-1, ac_space.n)) pred_next_latent_mu1 = get_transition_model()(tf.concat( [self.h, act_one_hot_target], axis=1)) pred_next_latent_mu2 = shuffle_custom(pred_next_latent_mu1) z_dist = tf.reduce_mean( tf.compat.v1.losses.huber_loss( self.h, shuffle_custom(self.h), reduction=tf.compat.v1.losses.Reduction.NONE), 1) r_dist = tf.compat.v1.losses.huber_loss( self.R, shuffle_custom(self.R), reduction=tf.compat.v1.losses.Reduction.NONE) transition_dist = tf.reduce_mean( tf.compat.v1.losses.huber_loss( pred_next_latent_mu1, pred_next_latent_mu2, reduction=tf.compat.v1.losses.Reduction.NONE), 1) bisimilarity = r_dist + Config.GAMMA * transition_dist self.encoder_bisimilarity_loss = tf.reduce_mean( tf.math.pow(z_dist - bisimilarity, 2)) # latent loss pred_next_latent_mu1_3d = tf.transpose( tf.reshape(pred_next_latent_mu1, [-1, Config.NUM_ENVS, 256]), (1, 0, 2)) # 32 x n_timesteps x n_hidden h_3d = tf.transpose(tf.reshape(self.h, [-1, Config.NUM_ENVS, 256]), (1, 0, 2)) # 32 x n_timesteps x n_hidden pred_next_latent_mu1 = pred_next_latent_mu1_3d[:, : -1, :] # t = 0 to n_timesteps-1 next_h = h_3d[:, 1:, :] # t 
= 1 to n_timesteps diff = (pred_next_latent_mu1 - tf.stop_gradient(next_h)) self.latent_transition_loss = tf.reduce_mean(0.5 * tf.math.pow(diff, 2)) with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("head_0", reuse=tf.compat.v1.AUTO_REUSE): self.pd_train = [ self.pdtype.pdfromlatent(tf.stop_gradient(self.h), init_scale=0.01)[0] ] self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]] # Plain Dropout version: Only fast updates / stochastic latent for VIB self.pd_run = self.pd_train self.vf_run = self.vf_train # For Dropout: Always change layer, so slow layer is never used self.run_dropout_assign_ops = [] # Use the current head for classical PPO updates a0_run = [ self.pd_run[head_idx].sample() for head_idx in range(Config.POLICY_NHEADS) ] neglogp0_run = [ self.pd_run[head_idx].neglogp(a0_run[head_idx]) for head_idx in range(Config.POLICY_NHEADS) ] self.initial_state = None def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict={}, *_args, **_kwargs): if Config.REPLAY: ob = ob.astype(np.float32) head_idx = 0 a, v, neglogp = sess.run([ a0_run[head_idx], self.vf_run[head_idx], neglogp0_run[head_idx] ], {X: ob}) return a, v, self.initial_state, neglogp def rep_vec(ob, *_args, **_kwargs): return sess.run(self.h, {X: ob}) def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs): if Config.AGENT == 'ppo_diayn': return sess.run(self.vf_run, {X: ob, Z: one_hot_skill}) elif Config.AGENT == 'ppo_goal': return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill}) else: return sess.run(self.vf_run, {self.STATE: ob, X: ob}) def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs): if Config.AGENT == 'ppo_diayn': return sess.run(self.vf_i_run, {X: ob, Z: one_hot_skill}) elif Config.AGENT == 'ppo_goal': return sess.run(self.vf_i_run, { REP_PROC: ob, Z: one_hot_skill }) else: return sess.run(self.vf_i_run, {self.STATE: ob, X: ob}) def nce_fw_pass(nce_dict): return sess.run([self.vf_i_run, self.rep_loss], nce_dict) def custom_train(ob, rep_vecs): return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0] def compute_codes(ob, act): return sess.run([ tf.reshape(self.codes, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), tf.reshape(self.z_t_1, (Config.NUM_ENVS, Config.NUM_STEPS, -1)), self.h_codes[:, 1:] ], { REP_PROC: ob, self.A_cluster: act }) def compute_hard_codes(ob): return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob}) def compute_cluster_returns(returns): return sess.run([self.cluster_returns], {self.R_cluster: returns}) self.X = X self.processed_x = processed_x self.step = step self.value = value self.value_i = value_i self.rep_vec = rep_vec self.custom_train = custom_train self.nce_fw_pass = nce_fw_pass self.encoder = choose_cnn self.REP_PROC = REP_PROC self.Z = Z self.compute_codes = compute_codes self.compute_hard_codes = compute_hard_codes self.compute_cluster_returns = compute_cluster_returns self.CODES = CODES self.STEP_BOOL = STEP_BOOL
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs):  #pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    # So that I can compute the saliency map
    if Config.REPLAY:
        X = tf.placeholder(shape=(nbatch,) + ob_space.shape, dtype=np.float32, name='Ob')
        processed_x = X
    else:
        X, processed_x = observation_input(ob_space, nbatch)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        h, h_vf, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(processed_x)
        self.train_dropout_assign_ops = fast_dropout_assigned_ops
        self.run_dropout_assign_ops = slow_dropout_assign_ops

        # Noisy policy and value function for train
        if Config.BETA >= 0:
            pdparam = _matching_fc(h, 'pi', ac_space.n, init_scale=1.0, init_bias=0)
            pdparam = tf.reshape(pdparam, shape=(Config.NR_SAMPLES, -1, ac_space.n))
            pdparam = tf.transpose(pdparam, perm=[1, 0, 2])
            dists = ds.Categorical(logits=pdparam)
            self.pd_train = ds.MixtureSameFamily(
                mixture_distribution=ds.Categorical(probs=[1. / Config.NR_SAMPLES] * Config.NR_SAMPLES),
                components_distribution=dists)
            self.pd_train.neglogp = lambda a: -self.pd_train.log_prob(a)
            self.vf_train = tf.reduce_mean(
                tf.reshape(fc(h, 'v', 1), shape=(Config.NR_SAMPLES, -1, 1)), 0)[:, 0]
        else:
            self.pd_train, _ = self.pdtype.pdfromlatent(h, init_scale=0.01)
            self.vf_train = fc(h, 'v', 1)[:, 0]

        if Config.SNI:
            assert Config.DROPOUT == 0
            assert not Config.OPENAI  # Used with VIB: Noiseless pd_run and _both_ value functions
            print("Activating SNI (includes VF)")
            # Use deterministic value function for both as VIB for regression seems like a bad idea
            self.vf_run = self.vf_train = fc(h_vf, 'v', 1)[:, 0]
            # Have a deterministic run policy based on the mean
            self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01)
        elif Config.SNI2:
            assert not Config.OPENAI  # Used with Dropout instead of OPENAI modifier
            # 'RUN' versions are updated slowly, train versions updated faster, gradients are mixed
            print("Activating SNI2")
            # Deterministic bootstrap value... doesn't really matter but this is more consistent
            self.vf_run = fc(h_vf, 'v', 1)[:, 0]
            # Run policy based on slow changing latent
            self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01)
            # Train is updated for each gradient update, slow is only updated once per batch
        elif Config.OPENAI:
            # Completely overwrite train versions as everything changes slowly
            # Train version is same as run version, both of which are slow
            self.pd_run, _ = self.pdtype.pdfromlatent(h_vf, init_scale=0.01)
            self.pd_train = self.pd_run
            self.vf_run = self.vf_train = fc(h_vf, 'v', 1)[:, 0]
            # Stochastic version is never used, so can set to ignore
            self.train_dropout_assign_ops = []
        else:
            # Plain Dropout version: Only fast updates / stochastic latent for VIB
            self.pd_run = self.pd_train
            self.vf_run = self.vf_train
            # For Dropout: Always change layer, so slow layer is never used
            self.run_dropout_assign_ops = []

    # Used in step
    a0_run = self.pd_run.sample()
    neglogp0_run = self.pd_run.neglogp(a0_run)
    self.initial_state = None

    def step(ob, update_frac, *_args, **_kwargs):
        if Config.REPLAY:
            ob = ob.astype(np.float32)
        a, v, neglogp = sess.run([a0_run, self.vf_run, neglogp0_run], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, update_frac, *_args, **_kwargs):
        return sess.run(self.vf_run, {X: ob})

    self.X = X
    self.processed_x = processed_x
    self.step = step
    self.value = value
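In the `Config.BETA >= 0` branch above, the run-time policy is a uniform mixture of `Config.NR_SAMPLES` categorical heads, so its negative log-probability is a logsumexp over the per-head log-probabilities minus `log NR_SAMPLES`. A small NumPy/SciPy sketch of that identity (the helper name is hypothetical; the real graph uses `ds.MixtureSameFamily`):

import numpy as np
from scipy.special import logsumexp, log_softmax

def mixture_neglogp(logits, action):
    # logits: (nr_samples, n_actions) for one observation; action: int index
    log_probs = log_softmax(logits, axis=-1)[:, action]      # per-head log p_k(a)
    return -(logsumexp(log_probs) - np.log(logits.shape[0])) # -log of the uniform mixture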
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs):  #pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        if USE_COLOR_TRANSFORM:
            out_shape = processed_x.get_shape().as_list()
            mask_vbox = tf.Variable(tf.zeros_like(processed_x, dtype=bool), trainable=False)
            rh = .2  # hard-coded velocity box size
            # mh = tf.cast(tf.cast(out_shape[1], dtype=tf.float32)*rh, dtype=tf.int32)
            mh = int(out_shape[1] * rh)
            mw = mh * 2
            mask_vbox = mask_vbox[:, :mh, :mw].assign(
                tf.ones([out_shape[0], mh, mw, out_shape[-1]], dtype=bool))
            masked = tf.where(mask_vbox, x=tf.zeros_like(processed_x), y=processed_x)

            # tf.image.adjust_brightness vs. ImageEnhance.Brightness:
            # the tf version is additive while the PIL version is multiplicative
            delta_brightness = tf.get_variable(
                name='randprocess_brightness',
                initializer=tf.random_uniform([], -.5, .5),
                trainable=False)
            # tf.image.adjust_contrast vs. PIL.ImageEnhance.Contrast
            delta_contrast = tf.get_variable(
                name='randprocess_contrast',
                initializer=tf.random_uniform([], .5, 1.5),
                trainable=False)
            # tf.image.adjust_saturation vs. PIL.ImageEnhance.Color
            delta_saturation = tf.get_variable(
                name='randprocess_saturation',
                initializer=tf.random_uniform([], .5, 1.5),
                trainable=False)

            processed_x1 = tf.image.adjust_brightness(masked, delta_brightness)
            processed_x1 = tf.clip_by_value(processed_x1, 0., 255.)
            processed_x1 = tf.where(mask_vbox, x=masked, y=processed_x1)

            processed_x2 = tf.image.adjust_contrast(processed_x1, delta_contrast)
            processed_x2 = tf.clip_by_value(processed_x2, 0., 255.)
            processed_x2 = tf.where(mask_vbox, x=masked, y=processed_x2)

            processed_x3 = tf.image.adjust_saturation(processed_x2, delta_saturation)
            processed_x3 = tf.clip_by_value(processed_x3, 0., 255.)
            processed_x3 = tf.where(mask_vbox, x=processed_x, y=processed_x3)
        else:
            processed_x3 = processed_x

        h, self.dropout_assign_ops = choose_cnn(processed_x3)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
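The comments above note that `tf.image.adjust_brightness` is additive while `PIL.ImageEnhance.Brightness` is multiplicative. A short NumPy illustration of the difference, assuming float pixel values in [0, 255] as used here (helper names are illustrative only):

import numpy as np

def adjust_brightness_additive(img, delta):
    # tf.image.adjust_brightness: a constant delta is added to every pixel value
    return np.clip(img + delta, 0.0, 255.0)

def adjust_brightness_multiplicative(img, factor):
    # PIL.ImageEnhance.Brightness: every pixel value is scaled by the factor
    return np.clip(img * factor, 0.0, 255.0)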
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, name='policy', args=None):  #pylint: disable=W0613
    policy_variance_state_dependent = args.policy_variance_state_dependent
    ac_fn = args.ac_fn
    hidden_sizes = args.hidden_sizes
    num_sharing_layers = args.num_sharing_layers
    num_layers = args.num_layers
    assert ac_fn in ['tanh', 'sigmoid', 'relu']

    if isinstance(hidden_sizes, int):
        assert num_layers is not None
        hidden_sizes = [hidden_sizes] * num_layers
    if num_layers is None:
        num_layers = len(hidden_sizes)
    assert num_layers == len(hidden_sizes)
    # print(f'Policy hidden_sizes:{hidden_sizes}')

    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope(name, reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = getattr(tf.nn, ac_fn)
        processed_x = tf.layers.flatten(processed_x)

        # --- shared layers
        for ind_layer in range(num_sharing_layers):
            processed_x = activ(fc(processed_x, f'share_fc{ind_layer}',
                                   nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

        # --- policy
        pi_h = processed_x
        for ind_layer in range(num_sharing_layers, num_layers):
            pi_h = activ(fc(pi_h, f'pi_fc{ind_layer}',
                            nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

        from gym import spaces
        params_additional = {}
        if policy_variance_state_dependent and isinstance(ac_space, spaces.Box):
            latent_logstd = processed_x
            for ind_layer in range(num_sharing_layers, num_layers):
                latent_logstd = activ(fc(latent_logstd, f'logstd_fc{ind_layer}',
                                         nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            params_additional['latent_logstd'] = latent_logstd

        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h, init_scale=0.01,
                                                    logstd_initial=args.logstd,
                                                    **params_additional)

        # --- value function
        vf_h = processed_x
        for ind_layer in range(num_sharing_layers, num_layers):
            vf_h = activ(fc(vf_h, f'vf_fc{ind_layer}',
                            nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
        vf = fc(vf_h, 'vf', 1)[:, 0]

        a_sample = self.pd.sample()
        neglogp_sample = self.pd.neglogp(a_sample)
        self.initial_state = None

        # --- predict function
        # the action fed to the predictor can come from a placeholder,
        # the stochastic action, or the deterministic action
        if args.coef_predict_task > 0:
            import tensorflow.contrib.distributions as dists
            assert isinstance(ac_space, spaces.Box), 'Only implemented for Box action spaces'
            A_type = tf.placeholder_with_default('pl', dtype=tf.string)
            A_pl = self.pdtype.sample_placeholder([None])
            self.A = A_pl
            self.A_type = A_type
            A_input_1 = U.switch(tf.equal(A_type, 'det'), self.pd.mode(), a_sample)
            A_input = U.switch(tf.equal(A_type, 'pl'), A_pl, A_input_1)
            predict_h = tf.concat((processed_x, A_input), axis=1)
            for ind_layer in range(num_sharing_layers, num_layers):
                predict_h = activ(fc(predict_h, f'predict_fc{ind_layer}',
                                     nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            predict_mean = fc(predict_h, 'predict_mean', nh=ob_space.shape[0], init_scale=np.sqrt(2))
            predict_cov_init_value = np.identity(ob_space.shape[0])
            predict_cov = tf.get_variable(name='predict_cov',
                                          shape=predict_cov_init_value.shape,
                                          initializer=tf.constant_initializer(predict_cov_init_value))
            predict_dist = dists.MultivariateNormalTriL(predict_mean, predict_cov)
            self.predict_dist = predict_dist

        scope_model = tf.get_variable_scope().name
        self.variables_all = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_model)
        self.variables_trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope_model)

    # --- set logstd
    # if isinstance(ac_space, spaces.Box):
    #     if not policy_variance_state_dependent:
    #         logstd_pl, _ = observation_input(ac_space, batch_size=1, name='ac')
    #         assign_logstd = tf.assign(self.pdtype.logstd, logstd_pl)
    #         set_logstd_entity = U.function([logstd_pl], assign_logstd)
    #         def set_logstd(logstd_new):
    #             # if isinstance(logstd_new, float):
    #             #     logstd_new = [[logstd_new] * ac_space.shape[0]]
    #             set_logstd_entity(logstd_new)
    #         self.set_logstd = set_logstd
    #     self.get_logstd = U.function([], self.pdtype.logstd)

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a_sample, vf, neglogp_sample], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    def step_policyflat(ob, *_args, **_kwargs):
        # TODO: test flatparam for discrete action spaces
        a, v, neglogp, policyflat = sess.run([a_sample, vf, neglogp_sample, self.pd.flatparam()], {X: ob})
        return a, v, self.initial_state, neglogp, policyflat

    def step_test(ob, *_args, **_kwargs):
        a = sess.run([self.pd.mode()], {X: ob})
        return a

    self.X = X
    self.vf = vf
    self.step = step
    self.step_policyflat = step_policyflat
    self.value = value
    self.step_test = step_test
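The `hidden_sizes` / `num_layers` handling at the top of this constructor accepts either a single int plus a layer count or an explicit list of widths. A self-contained sketch of that resolution logic, with example assertions (the helper name is hypothetical, the logic mirrors the code above):

def resolve_hidden_sizes(hidden_sizes, num_layers):
    # int + count -> repeated widths; list -> count inferred from its length
    if isinstance(hidden_sizes, int):
        assert num_layers is not None
        hidden_sizes = [hidden_sizes] * num_layers
    if num_layers is None:
        num_layers = len(hidden_sizes)
    assert num_layers == len(hidden_sizes)
    return hidden_sizes, num_layers

assert resolve_hidden_sizes(64, 2) == ([64, 64], 2)
assert resolve_hidden_sizes([128, 64], None) == ([128, 64], 2)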
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs):  #pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    self.rep_loss = None
    # explicitly create vector space for latent vectors
    latent_space = Box(-np.inf, np.inf, shape=(256,))
    # So that I can compute the saliency map
    if Config.REPLAY:
        X = tf.compat.v1.placeholder(shape=(nbatch,) + ob_space.shape, dtype=np.float32, name='Ob')
        processed_x = X
    else:
        X, processed_x = observation_input(ob_space, None)

    TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
    REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 64, 64, 3), name='Rep_Proc')
    Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx')
    Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, Config.N_SKILLS), name='Curr_skill')
    CLUSTER_DIMS = 128
    HIDDEN_DIMS_SSL = 256
    self.protos = tf.compat.v1.Variable(
        initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)),
        trainable=True, name='Prototypes')
    self.A = self.pdtype.sample_placeholder([None], name='A')
    # trajectories of length m, for N policy heads.
    self.STATE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3])
    self.STATE_NCE = tf.compat.v1.placeholder(tf.float32, [Config.REP_LOSS_M, 1, None, 64, 64, 3])
    self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3])
    # labels of Q value quantile bins
    self.LAB_NCE = tf.compat.v1.placeholder(tf.float32, [Config.POLICY_NHEADS, None])
    self.A_i = self.pdtype.sample_placeholder([None, Config.REP_LOSS_M, 1], name='A_i')
    self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None], name='R_cluster')
    self.A_cluster = self.pdtype.sample_placeholder([None], name='A_cluster')

    X = REP_PROC  # tf.reshape(REP_PROC, [-1, 64, 64, 3])

    with tf.compat.v1.variable_scope("target" if Config.STOP_GRAD_PPO else "online",
                                     reuse=tf.compat.v1.AUTO_REUSE):
        act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(X)
        self.train_dropout_assign_ops = fast_dropout_assigned_ops
        self.run_dropout_assign_ops = slow_dropout_assign_ops
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        self.h = tf.concat([act_condit, act_invariant], axis=1)

    """
    Clustering part
    """
    N_ACTIONS = 5 if Config.ENVIRONMENT == 'ising' else 15
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        # h_codes: n_batch x n_t x n_rkhs
        act_condit, act_invariant, _, _ = choose_cnn(X)
        self.h_codes = tf.transpose(
            tf.reshape(tf.concat([act_condit, act_invariant], axis=1), [-1, Config.NUM_ENVS, 256]),
            (1, 0, 2))
        act_one_hot = tf.transpose(
            tf.reshape(tf.one_hot(self.A_cluster, ac_space.n), [-1, Config.NUM_ENVS, ac_space.n]),
            (1, 0, 2))

        h_acc = []
        h_acc_no_act = []
        for k in range(Config.CLUSTER_T):
            h_t = self.h_codes[:, k:tf.shape(self.h_codes)[1] - (Config.CLUSTER_T - k - 1)]
            a_t = act_one_hot[:, k:tf.shape(act_one_hot)[1] - (Config.CLUSTER_T - k - 1)]
            h_t_film = tf.reshape(
                FiLM(widths=[128], name='FiLM_layer')([
                    tf.expand_dims(tf.expand_dims(tf.reshape(h_t, (-1, 256)), 1), 1),
                    tf.reshape(a_t, (-1, N_ACTIONS))
                ])[:, 0, 0],
                (Config.NUM_ENVS, -1, 256))
            h_acc_no_act.append(tf.reshape(h_t, (Config.NUM_ENVS, -1, 256)))
            h_acc.append(h_t_film)

        # h_seq_no_act = tf.reshape(tf.concat(h_acc_no_act, 2), (-1, 256 * Config.CLUSTER_T))
        h_seq = tf.reshape(tf.concat(h_acc, 2), (-1, 256 * Config.CLUSTER_T))
        self.h_seq = h_seq

        # self.z_t_no_act = get_online_predictor(n_in=256*Config.CLUSTER_T, n_out=CLUSTER_DIMS, prefix='SH_z_pred_no_act')(h_seq_no_act)
        self.z_t = get_online_predictor(n_in=256 * Config.CLUSTER_T, n_out=CLUSTER_DIMS,
                                        prefix='SH_z_pred')(h_seq)
        self.u_t = get_predictor(n_in=CLUSTER_DIMS, n_out=CLUSTER_DIMS, prefix='SH_u_pred')(self.z_t)
        self.z_t_1 = self.z_t

        # scores: n_batch x n_clusters
        scores = tf.linalg.matmul(
            tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0],
            tf.linalg.normalize(self.protos, axis=1, ord='euclidean')[0])
        self.codes = sinkhorn(scores=scores)

        self.myow_loss = 0.
        if Config.MYOW:
            """
            MYOW where k-NN neighbors are replaced by Sinkhorn clusters
            """
            # with tf.compat.v1.variable_scope("random", reuse=tf.compat.v1.AUTO_REUSE):
            #     # h_codes: n_batch x n_t x n_rkhs
            #     act_condit_target, act_invariant_target, _, _ = choose_cnn(X)
            #     h_codes_target = tf.transpose(tf.reshape(tf.concat([act_condit_target, act_invariant_target], axis=1), [-1, Config.NUM_ENVS, 256]), (1, 0, 2))
            #     h_t_target = h_codes_target[:, :-1]
            #     h_tp1_target = h_codes_target[:, 1:]
            #     # h_a_t = tf.transpose(tf.reshape(get_predictor(n_in=ac_space.n, n_out=256, prefix="SH_a_emb")(act_one_hot), (-1, Config.NUM_ENVS, 256)), (1, 0, 2))
            #     h_seq_target = tf.reshape(tf.concat([h_t_target, h_tp1_target], 2), (-1, 256 * Config.CLUSTER_T))
            #     act_one_hot_target = tf.reshape(tf.one_hot(self.A_cluster, ac_space.n), (-1, ac_space.n))
            #     h_seq_target = tf.squeeze(tf.squeeze(FiLM(widths=[512, 512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq_target, 1), 1), act_one_hot_target]), 1), 1)

            y_online = h_seq
            y_target = tf.stop_gradient(h_seq)
            # y_reward = tf.reshape(self.R_cluster, (-1, 1))

            # Find cluster adjacency scores
            dist = _compute_distance(tf.transpose(self.protos), tf.transpose(self.protos))
            k_t = Config.N_KNN
            vals, indx = tf.nn.top_k(-dist, k_t + 1, sorted=True)
            cluster_idx = tf.cast(tf.argmax(scores, 1), tf.int32)

            cluster_membership_list = []
            for i in range(Config.N_SKILLS):
                filter_ = tf.cast(tf.fill(tf.shape(cluster_idx), i), tf.int32)
                mask = tf.math.equal(filter_, cluster_idx)
                cluster_vecs = tf.cast(tf.where(mask), tf.int32)
                cluster_vecs = tf.cond(
                    tf.math.equal(tf.shape(cluster_vecs)[0], 0),
                    lambda: tf.constant([[0]], tf.int32),
                    lambda: cluster_vecs)
                # cluster_idx = tf.cast(tf.round(tf.random.uniform((1,), maxval=tf.cast(tf.shape(cluster_vecs), tf.float32))[0]), tf.int32)  # randomly sample a vector from its cluster
                cluster_membership_list.append(cluster_vecs[0])  # take first vector of this cluster as representative
            cluster_membership_list = tf.stack(cluster_membership_list)

            with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
                v_online_net = get_predictor(n_in=256 * Config.CLUSTER_T, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_v_pred')
                r_online_net = get_predictor(n_in=HIDDEN_DIMS_SSL, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_r_pred')
                v_online = v_online_net(y_online)
                r_online = r_online_net(v_online)
            with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE):
                v_target_net = get_predictor(n_in=256 * Config.CLUSTER_T, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_v_pred')
                r_target_net = get_predictor(n_in=HIDDEN_DIMS_SSL, n_out=HIDDEN_DIMS_SSL, prefix='MYOW_r_pred')

            for k in range(k_t):
                nearby_cluster_idx = tf.gather(indx[:, k + 1], cluster_idx)
                nearby_batch_vecs = tf.reshape(
                    tf.gather(cluster_membership_list, tf.cast(nearby_cluster_idx, tf.int32)), (-1,))
                N_target = tf.gather(y_target, nearby_batch_vecs)
                v_target = v_target_net(N_target)
                # r_target = r_target_net(v_target)
                self.myow_loss += tf.reduce_mean(cos_loss(r_online, v_target))  # + tf.reduce_mean(cos_loss(r_target, v_online))

            # with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            #     phi_s = get_online_predictor(n_in=256, n_out=CLUSTER_DIMS, prefix='SH_z_pred')(tf.reshape(h_acc[-1], (-1, 256)))
            #     self.myow_loss += tf.reduce_mean(cos_loss(phi_s, tf.transpose(tf.gather(self.protos, cluster_idx, axis=1), (1, 0))))

    """
    Intrinsic rewards
    """
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        self.R_I_SCALE = tf.nn.relu(
            get_linear_layer(n_in=256, n_out=1, prefix='r_i_scale',
                             init=initializers.RandomNormal(stddev=0.11))(
                tf.reshape(tf.stop_gradient(h_acc[-1]), (-1, 256))))

    # self.h = get_predictor(n_in=256+Config.N_SKILLS, n_out=256)(tf.concat([self.h, tf.stop_gradient(scores)], 1))

    """
    Condition on soft-cluster assignments for policy head (Cluster Conditioned Policy)
    """
    if Config.CLUSTER_CONDIT_POLICY:
        concat_code = tf.stop_gradient(tf.reshape(self.codes, [-1, Config.N_SKILLS]))
        # print(self.h)
        # print(concat_code)
        # self.h = tf.concat([self.h, concat_code], axis=1)
        # h_seq = tf.squeeze(tf.squeeze(FiLM(widths=[512, 512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq, 1), 1), act_one_hot]), 1), 1)

    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
            self.pd_train = []
            for i in range(Config.POLICY_NHEADS):
                with tf.compat.v1.variable_scope("head_" + str(i), reuse=tf.compat.v1.AUTO_REUSE):
                    self.pd_train.append(self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0])
            with tf.compat.v1.variable_scope("head_i", reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train_i = self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]
        else:
            with tf.compat.v1.variable_scope("head_0", reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train = [self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]]

        if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
            # self.vf_train = [fc(self.h, 'v'+str(i), 1)[:, 0] for i in range(Config.POLICY_NHEADS)]
            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]
        else:
            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]

        self.vf_i_train = fc(tf.stop_gradient(self.h), 'v_i', 1)[:, 0]
        self.vf_i_run = self.vf_i_train

        # Plain Dropout version: Only fast updates / stochastic latent for VIB
        self.pd_run = self.pd_train
        self.vf_run = self.vf_train
        # For Dropout: Always change layer, so slow layer is never used
        self.run_dropout_assign_ops = []

    # Use the current head for classical PPO updates
    a0_run = [self.pd_run[head_idx].sample() for head_idx in range(Config.POLICY_NHEADS)]
    neglogp0_run = [self.pd_run[head_idx].neglogp(a0_run[head_idx]) for head_idx in range(Config.POLICY_NHEADS)]
    self.initial_state = None

    def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict={}, *_args, **_kwargs):
        if Config.REPLAY:
            ob = ob.astype(np.float32)
        a, v, v_i, neglogp = sess.run(
            [a0_run[0], self.vf_run[0], self.vf_i_run, neglogp0_run[0]],
            {REP_PROC: ob, Z: one_hot_skill})
        return a, v, v_i, self.initial_state, neglogp

    def rep_vec(ob, *_args, **_kwargs):
        return sess.run(self.h, {X: ob})

    def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
        return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})

    def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
        return sess.run(self.vf_i_run, {REP_PROC: ob, Z: one_hot_skill})

    def nce_fw_pass(nce_dict):
        return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

    def custom_train(ob, rep_vecs):
        return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

    def compute_codes(ob, act):
        return sess.run([
            tf.reshape(self.codes, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
            tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
            tf.reshape(self.z_t_1, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
            self.h_codes[:, 1:]
        ], {REP_PROC: ob, self.A_cluster: act})

    def compute_hard_codes(ob):
        return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

    def compute_cluster_returns(returns):
        return sess.run([self.cluster_returns], {self.R_cluster: returns})

    self.X = X
    self.processed_x = processed_x
    self.step = step
    self.value = value
    self.value_i = value_i
    self.rep_vec = rep_vec
    self.custom_train = custom_train
    self.nce_fw_pass = nce_fw_pass
    self.encoder = choose_cnn
    self.REP_PROC = REP_PROC
    self.Z = Z
    self.compute_codes = compute_codes
    self.compute_hard_codes = compute_hard_codes
    self.compute_cluster_returns = compute_cluster_returns
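The `sinkhorn()` helper used above to turn prototype scores into soft cluster codes is not shown in this file. The following is a minimal SwAV-style Sinkhorn-Knopp sketch of what such a helper typically computes from the (batch x prototypes) score matrix; it is an illustration only, not necessarily the repo's implementation:

import numpy as np

def sinkhorn_codes(scores, eps=0.05, n_iters=3):
    # scores: (batch, n_prototypes) cosine similarities between latents and prototypes
    q = np.exp(scores / eps).T             # n_prototypes x batch
    q /= q.sum()
    n_protos, batch = q.shape
    for _ in range(n_iters):
        q /= q.sum(axis=1, keepdims=True)  # rows: spread mass equally across prototypes
        q /= n_protos
        q /= q.sum(axis=0, keepdims=True)  # columns: each sample carries 1/batch of the mass
        q /= batch
    return (q * batch).T                   # batch x n_prototypes, each row sums to 1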