def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
    """Assemble the policy and value latents and wrap them in a PolicyWithValue.

    Args:
        nbatch: batch size handed to ``observation_placeholder`` when no
            placeholder is supplied; also divided by ``nsteps`` for
            recurrent networks.
        nsteps: steps per environment; only read on the recurrent path.
        sess: forwarded unchanged to ``PolicyWithValue``.
        observ_placeholder: observation tensor to reuse instead of calling
            ``observation_placeholder``.

    Returns:
        The constructed ``PolicyWithValue`` instance.
    """
    ob_space = env.observation_space

    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=nbatch)
    else:
        X = observ_placeholder

    extra_tensors = {}

    # Normalisation is applied only to float32 inputs; anything else is
    # passed to the encoder untouched.
    net_input = X
    if normalize_observations and X.dtype == tf.float32:
        net_input, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    net_input = encode_observation(ob_space, net_input)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(net_input)

        # A tuple result carries (latent, recurrent_tensors).
        recurrent_tensors = None
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent

        if recurrent_tensors is not None:
            # Recurrent architecture: call the network a second time with
            # the number of environments.
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                nbatch, nsteps)
            policy_latent, recurrent_tensors = policy_network(net_input, nenv)
            extra_tensors.update(recurrent_tensors)

    value_fn = value_network
    if value_fn is None or value_fn == 'shared':
        # The value head reads the policy latent directly.
        vf_latent = policy_latent
    else:
        if value_fn == 'copy':
            value_fn = policy_network
        else:
            assert callable(value_fn)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO recurrent architectures are not supported with value_network=copy yet
            vf_latent = value_fn(net_input)

    return PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )
def observation_input(ob_space, batch_size=None, name='Ob'):
    """Create an observation placeholder and its encoded form.

    Args:
        ob_space: a gym ``Discrete``, ``Box`` or ``MultiDiscrete`` space.
        batch_size: leading placeholder dimension (``None`` leaves it
            unspecified).
        name: name given to the placeholder.

    Returns:
        A ``(placeholder, encoded)`` tuple, where ``encoded`` is the result
        of ``encode_observation(ob_space, placeholder)``.

    Raises:
        AssertionError: if ``ob_space`` is not one of the accepted space types.
    """
    from gym.spaces import Discrete, Box, MultiDiscrete
    from baselines.common.input import encode_observation

    assert isinstance(ob_space, (Discrete, Box, MultiDiscrete)), \
        'Baselines only deal with Discrete, Box and MultiDiscrete observation spaces'

    dtype = ob_space.dtype
    if dtype == np.int8:
        # int8 observations are fed through a uint8 placeholder.
        dtype = np.uint8

    # Use the space's full shape whatever its rank. The previous code indexed
    # shape[0..2] explicitly, which raised IndexError for Discrete,
    # MultiDiscrete and non-3-D Box spaces even though the assertion above
    # accepts them.
    shape = tuple(ob_space.shape)

    placeholder = tf.placeholder(shape=(batch_size,) + shape, dtype=dtype, name=name)
    return placeholder, encode_observation(ob_space, placeholder)
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
    """Build a ``PolicyWithValue`` on top of the configured policy network.

    Args:
        nbatch: batch size for a newly requested observation placeholder;
            on the recurrent path it is also divided by ``nsteps``.
        nsteps: number of steps per environment (recurrent path only).
        sess: passed through to ``PolicyWithValue``.
        observ_placeholder: if not ``None``, used as the observation tensor
            instead of calling ``observation_placeholder``.

    Returns:
        The ``PolicyWithValue`` object.
    """
    ob_space = env.observation_space

    X = observ_placeholder
    if X is None:
        X = observation_placeholder(ob_space, batch_size=nbatch)

    extra_tensors = {}

    should_normalize = normalize_observations and X.dtype == tf.float32
    if should_normalize:
        encoded_x, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    else:
        encoded_x = X

    encoded_x = encode_observation(ob_space, encoded_x)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        net_out = policy_network(encoded_x)
        if isinstance(net_out, tuple):
            policy_latent, recurrent_tensors = net_out
            if recurrent_tensors is not None:
                # recurrent architecture, need a few more steps
                nenv = nbatch // nsteps
                assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                extra_tensors.update(recurrent_tensors)
        else:
            policy_latent = net_out

    _v_net = value_network
    uses_shared_latent = _v_net is None or _v_net == 'shared'

    if uses_shared_latent:
        vf_latent = policy_latent
    else:
        if _v_net == 'copy':
            # 'copy' means: same architecture as the policy, built under 'vf'.
            _v_net = policy_network
        else:
            assert callable(_v_net)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO recurrent architectures are not supported with value_network=copy yet
            vf_latent = _v_net(encoded_x)

    policy_kwargs_for_ctor = dict(
        env=env,
        observations=X,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
    )
    policy = PolicyWithValue(**policy_kwargs_for_ctor, **extra_tensors)
    return policy
def __init__(self, env, nbatch, nsteps, nenvs, network, **policy_kwargs):
    """Store the batch dimensions and build the network graph with a scalar head.

    Args:
        env: object whose ``observation_space`` is read.
        nbatch: batch size passed to ``observation_placeholder``.
        nsteps: stored on the instance; not otherwise used here.
        nenvs: stored on the instance and used as the length of ``self.OUT``.
        network: identifier passed to ``get_network_builder``.
        **policy_kwargs: forwarded to the builder returned by
            ``get_network_builder(network)``.
    """
    self.nbatch = nbatch
    self.nsteps = nsteps
    self.nenvs = nenvs
    self.ob_space = env.observation_space
    # float32 placeholder of length nenvs; how it is fed is not visible here.
    self.OUT = tf.placeholder(tf.float32, [nenvs])
    self.X = observation_placeholder(self.ob_space, batch_size=nbatch)
    encoded_x = encode_observation(self.ob_space, self.X)
    # NOTE(review): the source arrived with indentation collapsed. The four
    # statements below are assumed to all sit inside the 'pi' scope; confirm
    # against the original file, since it affects the variable names of the
    # 'vf' layer.
    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        self.net = get_network_builder(network)(**policy_kwargs)
        self.h1 = self.net(encoded_x)
        # Single-unit layer named 'vf' on top of the network output.
        self.h2 = fc(self.h1, 'vf', 1)
        # Take column 0 of the single-unit layer output.
        self.out = self.h2[:, 0]
def policy_fn(scope_name="pi", nbatch=None, nsteps=None, sess=sess, observ_placeholder=None):
    """Build a ``Policy`` from the configured policy network.

    Args:
        scope_name: forwarded to ``Policy`` as ``scope_name``. The network
            itself is always built under the literal variable scope ``'pi'``.
        nbatch: batch size for a newly requested observation placeholder;
            divided by ``nsteps`` on the recurrent path.
        nsteps: steps per environment (recurrent path only).
        sess: forwarded to ``Policy``; defaults to the enclosing ``sess``.
        observ_placeholder: observation tensor to reuse, if given.

    Returns:
        The constructed ``Policy``.
    """
    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=nbatch)
    else:
        X = observ_placeholder

    extra_tensors = {}

    net_input = X
    if normalize_observations and X.dtype == tf.float32:
        net_input, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    net_input = encode_observation(ob_space, net_input)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        # Here the network is expected to always return a 2-tuple.
        policy_latent, recurrent_tensors = policy_network(net_input)

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                nbatch, nsteps)
            policy_latent, recurrent_tensors = policy_network(net_input, nenv)
            extra_tensors.update(recurrent_tensors)

    return Policy(
        observations=X,
        action_space=ac_space,
        latent=policy_latent,
        sess=sess,
        train=train,
        beta=beta,
        l2=l2,
        lr=lr,
        init_scale=init_scale,
        init_bias=init_bias,
        trainable_variance=trainable_variance,
        trainable_bias=trainable_bias,
        init_logstd=init_logstd,
        scope_name=scope_name,
        clip=clip,
        class_weights=class_weights,
        **extra_tensors
    )
def pi_vf_fn(X, extra_tensors, nbatch, nsteps, recurrent_subname=None):
    """Shared network to extract latent feature for ob, ob_next.

    Args:
        X: observation tensor.
        extra_tensors: dict that is updated in place with ``'rms'`` and any
            recurrent tensors produced here.
        nbatch: batch size; divided by ``nsteps`` on the recurrent path.
        nsteps: steps per environment (recurrent path only).
        recurrent_subname: when given, recurrent tensor keys are stored as
            ``recurrent_subname + '_' + key``.

    Returns:
        ``(cnn_fm, policy_latent, vf)`` where ``vf`` is column 0 of a
        single-unit ``fc`` layer on the value latent.

    Raises:
        NotImplementedError: if ``value_network`` is not ``None``,
            ``'shared'`` or ``'gap'``.
    """
    ob_space = env.observation_space

    net_input = X
    if normalize_observations and X.dtype == tf.float32:
        net_input, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    net_input = encode_observation(ob_space, net_input)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        cnn_fm, policy_latent, recurrent_tensors = policy_network(net_input)

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                nbatch, nsteps)
            cnn_fm, policy_latent, recurrent_tensors = policy_network(net_input, nenv)

            if recurrent_subname is None:
                extra_tensors.update(recurrent_tensors)
            else:
                # Prefix the keys so several calls can share one dict.
                extra_tensors.update({
                    recurrent_subname + '_' + key: tensor
                    for key, tensor in recurrent_tensors.items()
                })

    # NOTE(review): the source arrived with indentation collapsed; the value
    # head below is assumed to be created inside the 'vf' scope - confirm.
    with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
        value_mode = value_network
        if value_mode is None or value_mode == 'shared':
            vf_latent = policy_latent
        elif value_mode == 'gap':
            vf_latent = global_average_pooling(cnn_fm, **policy_kwargs)
        else:
            raise NotImplementedError

        vf_out = fc(vf_latent, 'vf_fc', 1)
        vf = vf_out[:, 0]

    return cnn_fm, policy_latent, vf
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
    """Build a ``PolicyWithValue`` from a network that also returns an info loss.

    The policy network is expected to return a 3-tuple
    ``(latent, latent_mean, info_loss)``. A tuple-valued latent is rejected.

    Args:
        nbatch: batch size for a newly requested observation placeholder.
        nsteps: accepted for interface compatibility; not used here.
        sess: forwarded to ``PolicyWithValue``.
        observ_placeholder: observation tensor to reuse, if given.

    Returns:
        The constructed ``PolicyWithValue``.

    Raises:
        NotImplementedError: if the latent returned by the network is a tuple.
    """
    ob_space = env.observation_space

    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=nbatch)
    else:
        X = observ_placeholder

    extra_tensors = {}

    net_input = X
    if normalize_observations and X.dtype == tf.float32:
        net_input, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    net_input = encode_observation(ob_space, net_input)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        network_outputs = policy_network(net_input)
        policy_latent, policy_latent_mean, info_loss = network_outputs
        if isinstance(policy_latent, tuple):
            raise NotImplementedError()

    # No vf_latent is passed here (the argument was disabled in the original).
    return PolicyWithValue(
        env=env,
        observations=X,
        arch=arch,
        latent=policy_latent,
        latent_mean=policy_latent_mean,
        info_loss=info_loss,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, encoded_x=None):
    """Build one policy head, optionally reusing an existing encoding.

    When ``observ_placeholder`` is ``None`` a new placeholder is requested,
    encoded, and passed through the ``"cnn"`` network builder. Otherwise
    the supplied placeholder and ``encoded_x`` are used as they are.

    Args:
        nbatch: batch size for a newly requested placeholder; divided by
            ``nsteps`` on the recurrent path.
        nsteps: steps per environment (recurrent path only).
        sess: forwarded to ``PolicyWithValue``.
        observ_placeholder: existing observation tensor to reuse.
        encoded_x: features matching ``observ_placeholder``; only read when
            ``observ_placeholder`` is given.

    Returns:
        ``(policy, X, new_encoded_x)``: the ``PolicyWithValue``, the
        observation tensor, and the features fed to the policy network.
    """
    ob_space = env.observation_space
    extra_tensors = {}

    if observ_placeholder is not None:
        # Reuse what the caller already built.
        X = observ_placeholder
        new_encoded_x = encoded_x
    else:
        X = observation_placeholder(ob_space, batch_size=nbatch)
        raw = X
        if normalize_observations and X.dtype == tf.float32:
            raw, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        raw = encode_observation(ob_space, raw)
        cnn_builder = get_network_builder("cnn")
        new_encoded_x = cnn_builder(**policy_kwargs)(raw)

    # Variable scopes are suffixed with the head identifier.
    with tf.variable_scope('pi' + str(head), reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(new_encoded_x)

        recurrent_tensors = None
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                nbatch, nsteps)
            policy_latent, recurrent_tensors = policy_network(new_encoded_x, nenv)
            extra_tensors.update(recurrent_tensors)

    value_fn = value_network
    if value_fn is None or value_fn == 'shared':
        vf_latent = policy_latent
    else:
        if value_fn == 'copy':
            value_fn = policy_network
        else:
            assert callable(value_fn)

        with tf.variable_scope('vf' + str(head), reuse=tf.AUTO_REUSE):
            # The value network is expected to return a 2-tuple here; only
            # its first element is kept.
            vf_latent, _ = value_fn(new_encoded_x)

    policy = PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        head=head,
        vf_latent=vf_latent,  # identical to policy_latent in the shared case
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )
    return policy, X, new_encoded_x
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, randomization=True):
    """Build a ``PolicyWithValue`` with an optional non-trainable random conv layer.

    Args:
        nbatch: only read on the recurrent path, where it is divided by
            ``nsteps``. A new placeholder is always requested with
            ``batch_size=None``.
        nsteps: steps per environment (recurrent path only).
        sess: forwarded to ``PolicyWithValue``.
        observ_placeholder: observation tensor to reuse, if given.
        randomization: when true, the encoded observation is passed through
            a 3-filter, 3x3, non-trainable ``conv2d`` named ``'randcnn'``.

    Returns:
        The constructed ``PolicyWithValue``.
    """
    ob_space = env.observation_space
    extra_tensors = {}

    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=None)
    else:
        X = observ_placeholder

    encoded_x = encode_observation(ob_space, X)

    # Randomization
    if randomization:
        # Scale by 1/255 before the conv and back by 255 afterwards.
        scaled = encoded_x / 255.
        randomized = tf.layers.conv2d(
            scaled,
            3,
            3,
            padding='same',
            kernel_initializer=tf.initializers.glorot_normal(),
            trainable=False,
            name='randcnn')
        encoded_x = randomized * 255.
        # Global variables under the hard-coded scope "ppo2_model/randcnn".
        extra_tensors['randcnn_param'] = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope="ppo2_model/randcnn")

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(encoded_x)
        # Stored before any tuple unpacking, so for a tuple-returning
        # network this entry holds the whole tuple.
        extra_tensors['latent_fts'] = policy_latent

        recurrent_tensors = None
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                nbatch, nsteps)
            policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
            extra_tensors.update(recurrent_tensors)

    value_fn = value_network
    if value_fn is None or value_fn == 'shared':
        vf_latent = policy_latent
    else:
        if value_fn == 'copy':
            value_fn = policy_network
        else:
            assert callable(value_fn)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO recurrent architectures are not supported with value_network=copy yet
            vf_latent = value_fn(encoded_x)

    return PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )
def dynamics_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, index=None):
    """Build a ``DynamicsWithValue`` model under the scope ``'dyn<index>'``.

    Args:
        nbatch: batch size for a newly requested observation placeholder;
            divided by ``nsteps`` on the recurrent path.
        nsteps: steps per environment (recurrent path only).
        sess: forwarded to ``DynamicsWithValue``.
        observ_placeholder: observation tensor to reuse, if given.
        index: identifier interpolated into the variable scope name and
            forwarded to ``DynamicsWithValue``.

    Returns:
        The constructed ``DynamicsWithValue``.
    """
    ob_space = env.observation_space

    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=nbatch)
    else:
        X = observ_placeholder

    extra_tensors = {}

    net_input = X
    if normalize_observations and X.dtype == tf.float32:
        net_input, rms = _normalize_clip_observation(X)
        extra_tensors['rms'] = rms
    # Encode input in the way that is appropriate to the observation space.
    net_input = encode_observation(ob_space, net_input)

    # Each index gets its own variable scope.
    with tf.variable_scope('dyn%s' % index, reuse=tf.AUTO_REUSE):
        dynamics_latent = dynamics_network(net_input)

        recurrent_tensors = None
        if isinstance(dynamics_latent, tuple):
            dynamics_latent, recurrent_tensors = dynamics_latent

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent dynamics: batch size {} smaller than nsteps {}'.format(
                nbatch, nsteps)
            dynamics_latent, recurrent_tensors = dynamics_network(net_input, nenv)
            extra_tensors.update(recurrent_tensors)

    return DynamicsWithValue(
        env=env,
        observations=X,
        latent=dynamics_latent,
        sess=sess,
        index=index,
        **extra_tensors
    )
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
    """Build a ``PolicyWithValue`` for a dictionary observation space.

    One placeholder is requested per key of ``env.observation_space``, and
    each is encoded with its own sub-space. The list of encoded tensors is
    passed to the policy network as a single argument.

    Args:
        nbatch: batch size used for every observation placeholder.
        nsteps: accepted for interface compatibility; not used (recurrent
            policy networks are not handled by this variant).
        sess: forwarded to ``PolicyWithValue``.
        observ_placeholder: accepted for interface compatibility; not used,
            placeholders are always created here.

    Returns:
        The constructed ``PolicyWithValue``; its ``observations`` argument is
        the list of raw placeholders, in key order.

    Raises:
        TypeError: if the observation space is not a ``dict``.
        ValueError: if the observation space dictionary is empty.
        NotImplementedError: if observation normalisation is requested and
            the first placeholder is float32.
    """
    print("Observation space being passed to policies.py {}".format(
        env.observation_space))
    ob_space = env.observation_space
    print("Observation space is in policies.py {}".format(ob_space))

    # Only dictionaries are accepted for now.
    if not isinstance(ob_space, dict):
        raise TypeError(
            "policy_fn expects a dict observation space, got {}".format(
                type(ob_space).__name__))

    keys = list(ob_space)
    if not keys:
        raise ValueError("Observation space dictionary is empty")

    # One placeholder per key, in the dictionary's iteration order.
    ph_list = [
        observation_placeholder(ob_space[key], batch_size=nbatch)
        for key in keys
    ]
    # A Python list has no .shape attribute; report per-placeholder shapes.
    print("The shape of the placeholder list (ph_list) in policies.py {}".
          format([ph.shape for ph in ph_list]))

    extra_tensors = {}

    if normalize_observations and ph_list[0].dtype == tf.float32:
        # The previous code normalised an undefined name `X` here and so
        # always raised NameError. How running statistics should be kept per
        # key is not defined anywhere visible, so fail explicitly instead.
        raise NotImplementedError(
            "normalize_observations is not supported for dictionary "
            "observation spaces")

    # Build a new list rather than writing into ph_list, so the placeholders
    # passed as `observations` below stay the raw feedable tensors. Pairing
    # each key with its placeholder replaces a counter that was reset on
    # every iteration and therefore only ever encoded index 0.
    encoded_x = [
        encode_observation(ob_space[key], placeholder)
        for key, placeholder in zip(keys, ph_list)
    ]

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        # encoded_x is the list of encoded placeholders.
        policy_latent = policy_network(encoded_x)

    _v_net = value_network

    if _v_net is None or _v_net == 'shared':
        vf_latent = policy_latent
    else:
        if _v_net == 'copy':
            _v_net = policy_network
        else:
            assert callable(_v_net)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO recurrent architectures are not supported with value_network=copy yet
            vf_latent = _v_net(encoded_x)

    policy = PolicyWithValue(
        env=env,
        observations=ph_list,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )
    return policy
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, mix_mode='nomix'):
    """Build a ``PolicyWithValue``, optionally mixing pairs of observations.

    Args:
        nbatch: only read on the recurrent path, where it is divided by
            ``nsteps``. A new placeholder is always requested with
            ``batch_size=None``.
        nsteps: steps per environment (recurrent path only).
        sess: forwarded to ``PolicyWithValue``.
        observ_placeholder: observation tensor to reuse, if given.
        mix_mode: ``'nomix'`` feeds observations unchanged. ``'mixreg'`` or
            ``'mixobs'`` feeds a convex combination of two gathered copies of
            the batch, controlled by three extra placeholders exposed as
            ``coeff``, ``indices`` and ``other_indices``.

    Returns:
        The constructed ``PolicyWithValue``.

    Raises:
        ValueError: if ``mix_mode`` is not one of the supported values.
    """
    ob_space = env.observation_space
    extra_tensors = {}

    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=None)
    else:
        X = observ_placeholder

    if mix_mode == 'nomix':
        encoded_x = X
    elif mix_mode in ('mixreg', 'mixobs'):
        COEFF = tf.placeholder(tf.float32, [None])
        INDICES = tf.placeholder(tf.int32, [None])
        OTHER_INDICES = tf.placeholder(tf.int32, [None])

        # Reshape to (-1, 1, 1, 1) so the coefficient broadcasts over the
        # trailing three axes of the observation batch.
        coeff = tf.reshape(COEFF, (-1, 1, 1, 1))
        obs_float = tf.cast(X, tf.float32)
        first_term = coeff * tf.gather(obs_float, INDICES, axis=0)
        second_term = (1 - coeff) * tf.gather(obs_float, OTHER_INDICES, axis=0)
        mixed = first_term + second_term
        # The mixed result is cast to uint8 before encoding.
        encoded_x = tf.cast(mixed, tf.uint8)

        extra_tensors['coeff'] = COEFF
        extra_tensors['indices'] = INDICES
        extra_tensors['other_indices'] = OTHER_INDICES
    else:
        raise ValueError(f"Unknown mixing mode: {mix_mode} !")

    encoded_x = encode_observation(ob_space, encoded_x)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(encoded_x)

        recurrent_tensors = None
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
            policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
            extra_tensors.update(recurrent_tensors)

    value_fn = value_network
    if value_fn is None or value_fn == 'shared':
        vf_latent = policy_latent
    else:
        if value_fn == 'copy':
            value_fn = policy_network
        else:
            assert callable(value_fn)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO recurrent architectures are not supported with value_network=copy yet
            vf_latent = value_fn(encoded_x)

    return PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )
def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
    """Build a ``PolicyWithValue`` that also receives a latent for mirrored input.

    The observation is reversed along axis 1 and pushed through the same
    normalise / encode / policy-network steps as the original. The resulting
    latent is passed to ``PolicyWithValue`` as ``mirrorlatent``.

    Args:
        nbatch: batch size for a newly requested observation placeholder;
            divided by ``nsteps`` on the recurrent path.
        nsteps: steps per environment (recurrent path only).
        sess: forwarded to ``PolicyWithValue``.
        observ_placeholder: observation tensor to reuse, if given.

    Returns:
        The constructed ``PolicyWithValue``.
    """
    ob_space = env.observation_space

    if observ_placeholder is None:
        X = observation_placeholder(ob_space, batch_size=nbatch)
    else:
        X = observ_placeholder

    # Mirror the input. Per the original author's note, X is
    # batch_size*84*84*4, axis=1 flips top/bottom and axis=2 would flip
    # left/right.
    X_mirror = tf.reverse(X, axis=[1])

    extra_tensors = {}

    # Every "*_mirror" value below follows the same computation as its
    # unmirrored counterpart.
    net_input, net_input_mirror = X, X_mirror
    if normalize_observations and X.dtype == tf.float32:
        net_input, rms = _normalize_clip_observation(X)
        # Only the statistics object of the unmirrored input is kept.
        net_input_mirror, _ = _normalize_clip_observation(X_mirror)
        extra_tensors['rms'] = rms

    net_input = encode_observation(ob_space, net_input)
    net_input_mirror = encode_observation(ob_space, net_input_mirror)

    with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
        policy_latent = policy_network(net_input)
        # Pass the mirrored input through the same policy network.
        policy_latent_mirror = policy_network(net_input_mirror)

        recurrent_tensors = None
        if isinstance(policy_latent, tuple):
            policy_latent, recurrent_tensors = policy_latent
            policy_latent_mirror, _ = policy_latent_mirror

        if recurrent_tensors is not None:
            # recurrent architecture, need a few more steps
            nenv = nbatch // nsteps
            assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
            policy_latent, recurrent_tensors = policy_network(net_input, nenv)
            policy_latent_mirror, _ = policy_network(net_input_mirror, nenv)
            # Only the unmirrored recurrent tensors are exposed.
            extra_tensors.update(recurrent_tensors)

    value_fn = value_network
    if value_fn is None or value_fn == 'shared':
        vf_latent = policy_latent
    else:
        if value_fn == 'copy':
            value_fn = policy_network
        else:
            assert callable(value_fn)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            # TODO recurrent architectures are not supported with value_network=copy yet
            vf_latent = value_fn(net_input)

    # Per the original author's note: mirrorlatent is the policy-network
    # output for the mirrored input before any fully connected head. The
    # mirrored value and policy heads share it downstream, so no separate
    # vf_latent_mirror is defined.
    return PolicyWithValue(
        env=env,
        observations=X,
        latent=policy_latent,
        mirrorlatent=policy_latent_mirror,
        vf_latent=vf_latent,
        sess=sess,
        estimate_q=estimate_q,
        **extra_tensors
    )