def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin',
                                       kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits',
                             kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value',
                                 kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
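# Illustrative usage sketch (an assumption, not from the source): in baselines,
# `U.function` compiles the graph above into a callable, so `_act` maps
# (stochastic, ob) -> (sampled action, value prediction) for a batch of frames.
#
#   pi = CnnPolicy(...)                   # hypothetical construction
#   ac, vpred = pi._act(True, ob[None])   # ob[None] adds the batch dimension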
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    self.pdtype = make_pdtype(ac_space)
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    ob_shape = (nbatch,) + ob_space.shape
    self.pdtype = make_pdtype(ac_space)
    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        flatten = tf.layers.flatten
        pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
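# A minimal numpy sketch (illustrative, not part of the source) of the
# broadcasting trick used above: `mean * 0.0 + logstd` tiles the shared
# [1, act_dim] logstd row across the batch dimension of `mean`, so the
# concatenated pdparam carries [mean | logstd] in every row.
import numpy as np

mean = np.zeros((4, 2), dtype=np.float32)          # batch of 4, act_dim = 2
logstd = np.full((1, 2), -0.5, dtype=np.float32)   # one shared row of log-stds
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
assert pdparam.shape == (4, 4)                     # [mean | logstd] per batch row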
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = tf.tanh
        processed_x = tf.layers.flatten(processed_x)
        pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)

    M = tf.placeholder(tf.float32, [nbatch])           # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        vf = fc(h5, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(h5)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
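# Hypothetical rollout sketch for the recurrent policy above (names assumed,
# not from the source): nbatch must equal nenv * nsteps, and the LSTM state is
# threaded through `step` alongside the done-mask, which zeroes the state
# wherever an episode just ended.
#
#   policy = LstmPolicy(sess, ob_space, ac_space, nbatch=nenv * 1, nsteps=1)
#   state = policy.initial_state          # zeros, shape (nenv, 2 * nlstm)
#   for _ in range(horizon):
#       a, v, state, neglogp = policy.step(obs, state, dones)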
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
    """
    Parameters:
    ----------
    env             RL environment

    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which policy distribution parameters should be inferred

    vf_latent       latent state from which value function should be inferred (if None, then latent is used)

    sess            tensorflow session to run calculations in (if None, default session is used)

    **tensors       tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent

    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    # Based on the action space, will select what probability distribution type
    self.pdtype = make_pdtype(env.action_space)

    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    # Take an action
    self.action = self.pd.sample()

    # Calculate the neg log of our probability
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess or tf.get_default_session()

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]
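# Hypothetical usage sketch (names assumed, not from the source): with
# `estimate_q=True` the value head outputs one Q-value per discrete action and
# `vf` aliases the whole Q vector; otherwise `vf` is a scalar state value.
#
#   policy = PolicyWithValue(env, observations=X, latent=h, estimate_q=False)
#   a, nlp = policy.sess.run([policy.action, policy.neglogp], {X: obs})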
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)             # obs
    M = tf.placeholder(tf.float32, [nbatch])           # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact)
        vf = fc(h5, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    ob_shape = (nbatch,) + ob_space.shape
    actdim = ac_space.shape[0]
    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        pi = fc(h2, 'pi', actdim, init_scale=0.01)
        h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(h2, 'vf', 1)[:, 0]
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          obs_name='ob', obrms=True, final_std=0.01, init_logstd=0.0,
          observation_permutation=None, action_permutation=None, soft_mirror=False):
    assert isinstance(ob_space, gym.spaces.Box)

    # Build the observation permutation matrix; the sign of each permutation
    # entry encodes whether the mirrored dimension is negated.
    obs_perm_mat = np.zeros((len(observation_permutation), len(observation_permutation)),
                            dtype=np.float32)
    self.obs_perm_mat = obs_perm_mat
    for i, perm in enumerate(observation_permutation):
        obs_perm_mat[i][int(np.abs(perm))] = np.sign(perm)

    if isinstance(ac_space, gym.spaces.Box):
        act_perm_mat = np.zeros((len(action_permutation), len(action_permutation)),
                                dtype=np.float32)
        self.act_perm_mat = act_perm_mat
        for i, perm in enumerate(action_permutation):
            self.act_perm_mat[i][int(np.abs(perm))] = np.sign(perm)
    elif isinstance(ac_space, gym.spaces.MultiDiscrete):
        total_dim = int(np.sum(ac_space.nvec))
        dim_index = np.concatenate([[0], np.cumsum(ac_space.nvec)])
        act_perm_mat = np.zeros((total_dim, total_dim), dtype=np.float32)
        self.act_perm_mat = act_perm_mat
        for i, perm in enumerate(action_permutation):
            perm_mat = np.identity(ac_space.nvec[i])
            if np.sign(perm) < 0:
                perm_mat = np.flipud(perm_mat)
            self.act_perm_mat[
                dim_index[i]:dim_index[i] + ac_space.nvec[i],
                dim_index[int(np.abs(perm))]:dim_index[int(np.abs(perm))] + ac_space.nvec[int(np.abs(perm))]] = perm_mat

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    print(self.pdtype)
    print([sequence_length] + list(ob_space.shape))

    ob = U.get_placeholder(name=obs_name, dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    mirror_ob = tf.matmul(ob, obs_perm_mat)
    mirror_obz = tf.clip_by_value((mirror_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    if not obrms:
        obz = ob

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i + 1),
                                    weight_init=U.normc_initializer(1.0)))
    self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    if isinstance(ac_space, gym.spaces.Box):
        pol_net = GenericFF('pol_net', ob_space.shape[0], [], pdtype.param_shape()[0] // 2,
                            hid_size, num_hid_layers)
    elif isinstance(ac_space, gym.spaces.MultiDiscrete):
        pol_net = GenericFF('pol_net', ob_space.shape[0], [], pdtype.param_shape()[0],
                            hid_size, num_hid_layers)

    orig_out = pol_net.get_output_tensor(obz, None, tf.nn.tanh)
    mirr_out = tf.matmul(pol_net.get_output_tensor(mirror_obz, None, tf.nn.tanh), act_perm_mat)
    if not soft_mirror:
        mean = orig_out + mirr_out
    else:
        mean = orig_out
        self.additional_loss = tf.reduce_mean(tf.abs(orig_out - mirr_out)) * 1.0

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer(init_logstd))
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = mean

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, ac_space, X, hidden_size, n_layers=2, activation="tanh", value_baseline=False,
             scope='MlpPolicy', reuse=False, X_placeholder=None, fix_variance=False,
             init_logstd=None):
    """
    Gaussian Policy. The variance is learned as parameters. You can also pass in the
    logstd from the outside.

    __init__: Construct the graph for the MLP policy.

    :param ac_space: action space, one of `gym.spaces.Box`
    :param X: Tensor or input placeholder for the observation
    :param hidden_size: size of hidden layers in network
    :param activation: one of 'relu', 'tanh'
    :param scope: str, name of variable scope.
    :param reuse:
    :param value_baseline: bool flag whether to compute a value baseline
    :param X_placeholder:
    :param fix_variance:
    :param init_logstd:
    """
    assert n_layers >= 2, f"hey, what's going on with this puny {n_layers}-layer network? " \
                          f"--Ge (your friendly lab-mate)"
    if isinstance(scope, tf.VariableScope):
        self.scope_name = scope.name
    else:
        self.scope_name = scope
    self.name = (self.scope_name + "_reuse") if reuse else self.scope_name

    self.X_ph = X if X_placeholder is None else X_placeholder

    # done: this used to apply only to Discrete action spaces; it now works for
    # both discrete-action and Gaussian policies.
    if isinstance(ac_space, spaces.Discrete):
        act_dim = ac_space.n
    else:
        act_dim, *_ = ac_space.shape

    if activation == 'tanh':
        act = tf.tanh
    elif activation == "relu":
        act = tf.nn.relu
    else:
        raise TypeError(f"{activation} is not available in this MLP.")

    with tf.variable_scope(scope, reuse=reuse):
        h_ = X
        for i in range(1, n_layers + 1):  # there is no off-by-one error here --Ge.
            h_ = fc(h_, f'pi_fc_{i}', nh=hidden_size, init_scale=np.sqrt(2), act=act)
            # a_ = fc(h_, f'pi_attn_{i}', nh=h_.shape[1], init_scale=np.sqrt(2), act=tf.math.sigmoid)
            # h_ = fc(h_ * a_, f'pi_fc_{i}', nh=hidden_size, init_scale=np.sqrt(2), act=act)
        mu = fc(h_, 'pi', act_dim, act=lambda x: x, init_scale=0.01)
        # _ = fc(h2, 'pi', act_dim, act=tf.tanh, init_scale=0.01)
        # mu = ac_space.low + 0.5 * (ac_space.high - ac_space.low) * (_ + 1)

        self.h_ = h_  # used for learned loss

        # assert (not G.vf_coef) ^ (G.baseline == "critic"), "These two can not be true or false at the same time."
        if value_baseline:  # todo: conditionally declare these only when used
            # h1 = fc(X, 'vf_fc1', nh=hidden_size, init_scale=np.sqrt(2), act=act)
            # h2 = fc(h1, 'vf_fc2', nh=hidden_size, init_scale=np.sqrt(2), act=act)
            self.vf = fc(self.h_, 'vf', 1, act=lambda x: x)[:, 0]

        if isinstance(ac_space, spaces.Box):  # gaussian policy requires logstd
            shape = tf.shape(mu)[0]
            if fix_variance:
                _ = tf.ones(shape=[1, act_dim], name="unit_logstd") * (init_logstd or 0)
                logstd = tf.tile(_, [shape, 1])
            elif init_logstd is not None:
                _ = tf.get_variable(name="logstd", shape=[1, act_dim],
                                    initializer=tf.constant_initializer(init_logstd))
                # todo: clip logstd to limit the range.
                logstd = tf.tile(_, [shape, 1])
            else:
                # use a variance network when no initial logstd is given.
                # _ = fc(X, 'logstd_fc1', nh=hidden_size, init_scale=np.sqrt(2), act=act)
                # _ = fc(_, 'logstd_fc2', nh=hidden_size, init_scale=np.sqrt(2), act=act)
                # note: this doesn't work. Really need to bound the variance.
                # logstd = 1 + fc(self.h_, 'logstd', act_dim, act=lambda x: x, init_scale=0.01)
                logstd = fc(self.h_, 'logstd', act_dim, act=lambda x: x, init_scale=0.01)
                # logstd = fc(self.h2, 'logstd', act_dim, act=tf.tanh, init_scale=0.01)
                # logstd = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (logstd + 1)

            # GaussianPd takes 2 * [act_length] b/c of the logstd concatenation.
            ac = tf.concat([mu, logstd], axis=1)
            # A much simpler way is to multiply _logstd with a zero tensor shaped as mu:
            # [mu, mu * 0 + _logstd]
        else:
            raise NotImplementedError('Discrete action space is not implemented!')

    # list of parameters is fixed at graph time.
    # todo: only get trainables that are newly created by the current policy function.
    # self.trainables = tf.trainable_variables()
    # placeholders = placeholders_from_variables(self.trainables)
    # self._assign_placeholder_dict = {t.name: p for t, p in zip(self.trainables, placeholders)}
    # self._assign_op = tf.group(*[v.assign(p) for v, p in zip(self.trainables, placeholders)])

    with tf.variable_scope("Gaussian_Action"):
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(ac)
        self.a = a = self.pd.sample()
        self.mu = self.pd.mode()
        self.neglogpac = self.pd.neglogp(a)
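# Illustrative numpy version (assuming baselines' DiagGaussianPd semantics) of
# the sample and neglogp that the flat [mu | logstd] parameterization above yields.
import numpy as np

mu, logstd = np.zeros(2), np.full(2, -0.5)
std = np.exp(logstd)
a = mu + std * np.random.randn(2)  # pd.sample()
neglogp = (0.5 * np.sum(((a - mu) / std) ** 2)
           + 0.5 * np.log(2 * np.pi) * mu.size
           + np.sum(logstd))       # pd.neglogp(a)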
def __init__(self, sess, ob_space, sensor_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    ob_shape = (nbatch,) + ob_space.shape
    ob_sensor_shape = (nbatch,) + sensor_space.shape
    actdim = ac_space.shape[0]
    X_camera = tf.placeholder(tf.uint8, ob_shape, name='Ob_camera')  # obs
    X_sensor = tf.placeholder(tf.float32, ob_sensor_shape, name='Ob_sensor')
    self.pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        h_camera = conv(tf.cast(X_camera, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4,
                        init_scale=np.sqrt(2))
        h2_camera = conv(h_camera, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
        h3_camera = conv(h2_camera, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
        h3_camera = conv_to_fc(h3_camera)
        h4_camera = fc(h3_camera, 'fc1', nh=512, init_scale=np.sqrt(2))
        pi_camera = fc(h4_camera, 'pi', actdim, init_scale=0.01)
        vf_camera = fc(h4_camera, 'v', 1)[:, 0]

    self.pd = self.pdtype.pdfromflat(pi_camera)

    with tf.variable_scope("model_sensor", reuse=reuse):
        h1_sensor = fc(X_sensor, 'pi_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
        h2_sensor = fc(h1_sensor, 'pi_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
        pi_sensor = fc(h2_sensor, 'pi', actdim, init_scale=0.01)
        h1_sensor = fc(X_sensor, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
        h2_sensor = fc(h1_sensor, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
        vf_sensor = fc(h2_sensor, 'vf', 1)[:, 0]

    with tf.variable_scope("model", reuse=reuse):
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())
        X = tf.concat([X_camera, X_sensor], 0)
        pi_full = tf.concat([pi_camera, pi_sensor], 0)
        pi = fc(pi_full, 'pi', actdim, init_scale=0.01)
        vf_full = tf.concat([vf_camera, vf_sensor], 0)
        vf = fc(vf_full, 'vf', 1)[:, 0]

    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, ob_sensor, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X_camera: ob, X_sensor: ob_sensor})
        return a, v, self.initial_state, neglogp

    def value(ob, ob_sensor, *_args, **_kwargs):
        return sess.run(vf, {X_camera: ob, X_sensor: ob_sensor})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    self.rep_loss = None
    # explicitly create vector space for latent vectors
    latent_space = Box(-np.inf, np.inf, shape=(256,))
    # So that I can compute the saliency map
    if Config.REPLAY:
        X = tf.compat.v1.placeholder(shape=(nbatch,) + ob_space.shape, dtype=np.float32, name='Ob')
        processed_x = X
    else:
        X, processed_x = observation_input(ob_space, None)
    TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
    REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 84, 84, 3), name='Rep_Proc')
    Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx')
    Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, Config.N_SKILLS), name='Curr_skill')
    CLUSTER_DIMS = 128
    HIDDEN_DIMS_SSL = 256
    self.protos = tf.compat.v1.Variable(
        initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)),
        trainable=True, name='Prototypes')
    self.A = self.pdtype.sample_placeholder([None], name='A')
    # trajectories of length m, for N policy heads.
    self.STATE = tf.compat.v1.placeholder(tf.float32, [None, 84, 84, 3], name='State')
    self.STATE_NCE = tf.compat.v1.placeholder(tf.float32, [Config.REP_LOSS_M, 1, None, 84, 84, 3], name='State_NCE')
    self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None, 84, 84, 3], name='ANCH_NCE')
    # labels of Q value quantile bins
    self.LAB_NCE = tf.compat.v1.placeholder(tf.float32, [Config.POLICY_NHEADS, None], name='Labels')
    self.A_i = self.pdtype.sample_placeholder([None, Config.REP_LOSS_M, 1], name='A_i')
    self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None], name='R_cluster')  # 1056
    self.A_cluster = self.pdtype.sample_placeholder([None], name='A_cluster')

    X = REP_PROC  # tf.reshape(REP_PROC, [-1, 64, 64, 3])

    with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE):
        act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = choose_cnn(REP_PROC)
        self.train_dropout_assign_ops = fast_dropout_assigned_ops
        self.run_dropout_assign_ops = slow_dropout_assign_ops
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        self.h = tf.concat([act_condit, act_invariant], axis=1)

    """
    Clustering part
    """
    N_ACTIONS = 5 if Config.ENVIRONMENT == 'ising' else 15
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        # h_codes: n_batch x n_t x n_rkhs
        act_condit, act_invariant, _, _ = choose_cnn(REP_PROC)
        self.h_codes = tf.transpose(tf.reshape(self.h, [-1, Config.NUM_ENVS, 256]), (1, 0, 2))
        act_one_hot = tf.transpose(tf.reshape(self.A_cluster, [-1, 17, ac_space.shape[0]]), (1, 0, 2))
        # tf.one_hot on action clusters gives (1056, 15); after reshape it's
        # (33, 32, 15); after transpose it's (32, 33, 15); cts will be (32, 33, X)
        # act_one_hot = tf.transpose(tf.reshape(tf.one_hot(self.A_cluster, ac_space.n), [-1, Config.NUM_ENVS, ac_space.n]), (1, 0, 2))
        h_acc = []
        h_acc_no_act = []
        for k in range(Config.CLUSTER_T):
            h_t = self.h_codes[:, k:tf.shape(self.h_codes)[1] - (Config.CLUSTER_T - k - 1)]
            self.dummy_ht = h_t
            # if k = 0, T = 50, then 0:49
            # TODO(Ahmed): ask Bogdan how exactly this action subsampling/indexing is carried out
            a_t = act_one_hot[:, k:tf.shape(act_one_hot)[1] - (Config.CLUSTER_T - k - 1) - 1]
            # expand reshaping line by line for easier debugging
            h_t_reshaped = tf.reshape(h_t, (-1, 256))
            a_t_reshaped = tf.reshape(a_t, (-1, ac_space.shape[0]))
            h_t_final = tf.expand_dims(tf.expand_dims(h_t_reshaped, 1), 1)[1:]
            # TODO(Ahmed): ensure that the reshape from 256 to 255 doesn't break downstream
            h_t_film = h_t_final
            h_t_film = tf.reshape(
                FiLM(widths=[128], name='FiLM_layer')([h_t_final, a_t_reshaped])[:, 0, 0],
                (Config.NUM_ENVS, -1, 255))
            h_acc_no_act.append(tf.reshape(h_t, (Config.NUM_ENVS, -1, 256)))
            h_acc.append(h_t_film)

        # h_seq_no_act = tf.reshape(tf.concat(h_acc_no_act, 2), (-1, 256 * Config.CLUSTER_T))
        h_seq = tf.reshape(tf.concat(h_acc, 2), (-1, 256 * Config.CLUSTER_T))
        self.h_seq = h_seq

        # self.z_t_no_act = get_online_predictor(n_in=256*Config.CLUSTER_T, n_out=CLUSTER_DIMS, prefix='SH_z_pred_no_act')(h_seq_no_act)
        self.z_t = get_online_predictor(n_in=256 * Config.CLUSTER_T, n_out=CLUSTER_DIMS,
                                        prefix='SH_z_pred')(h_seq)
        self.u_t = get_predictor(n_in=CLUSTER_DIMS, n_out=CLUSTER_DIMS, prefix='SH_u_pred')(self.z_t)

    self.z_t_1 = self.z_t

    # scores: n_batch x n_clusters
    scores = tf.linalg.matmul(tf.linalg.normalize(self.z_t_1, axis=1, ord='euclidean')[0],
                              tf.linalg.normalize(self.protos, axis=1, ord='euclidean')[0])
    self.codes = sinkhorn(scores=scores)

    self.myow_loss = 0.
    if Config.MYOW:
        """
        MYOW where k-NN neighbors are replaced by Sinkhorn clusters
        """
        # with tf.compat.v1.variable_scope("random", reuse=tf.compat.v1.AUTO_REUSE):
        #     # h_codes: n_batch x n_t x n_rkhs
        #     act_condit_target, act_invariant_target, _, _ = choose_cnn(X)
        #     h_codes_target = tf.transpose(tf.reshape(tf.concat([act_condit_target, act_invariant_target], axis=1), [-1, Config.NUM_ENVS, 256]), (1, 0, 2))
        #     h_t_target = h_codes_target[:, :-1]
        #     h_tp1_target = h_codes_target[:, 1:]
        #     # h_a_t = tf.transpose(tf.reshape(get_predictor(n_in=ac_space.n, n_out=256, prefix="SH_a_emb")(act_one_hot), (-1, Config.NUM_ENVS, 256)), (1, 0, 2))
        #     h_seq_target = tf.reshape(tf.concat([h_t_target, h_tp1_target], 2), (-1, 256 * Config.CLUSTER_T))
        #     act_one_hot_target = tf.reshape(tf.one_hot(self.A_cluster, ac_space.n), (-1, ac_space.n))
        #     h_seq_target = tf.squeeze(tf.squeeze(FiLM(widths=[512, 512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq_target, 1), 1), act_one_hot_target]), 1), 1)
        y_online = h_seq
        y_target = tf.stop_gradient(h_seq)
        # y_reward = tf.reshape(self.R_cluster, (-1, 1))

        # Find cluster adjacency scores
        dist = _compute_distance(tf.transpose(self.protos), tf.transpose(self.protos))
        k_t = Config.N_KNN
        vals, indx = tf.nn.top_k(-dist, k_t + 1, sorted=True)
        cluster_idx = tf.cast(tf.argmax(scores, 1), tf.int32)
        cluster_membership_list = []
        for i in range(Config.N_SKILLS):
            filter_ = tf.cast(tf.fill(tf.shape(cluster_idx), i), tf.int32)
            mask = tf.math.equal(filter_, cluster_idx)
            cluster_vecs = tf.cast(tf.where(mask), tf.int32)
            cluster_vecs = tf.cond(tf.math.equal(tf.shape(cluster_vecs)[0], 0),
                                   lambda: tf.constant([[0]], tf.int32),
                                   lambda: cluster_vecs)
            # cluster_idx = tf.cast(tf.round(tf.random.uniform((1,), maxval=tf.cast(tf.shape(cluster_vecs), tf.float32))[0]), tf.int32)  # randomly sample a vector from its cluster
            cluster_membership_list.append(cluster_vecs[0])  # take first vector of this cluster as representative
        cluster_membership_list = tf.stack(cluster_membership_list)
        # import ipdb; ipdb.set_trace()
        # N_target = y_target
        with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
            v_online_net = get_predictor(n_in=256 * Config.CLUSTER_T, n_out=HIDDEN_DIMS_SSL,
                                         prefix='MYOW_v_pred')
            r_online_net = get_predictor(n_in=HIDDEN_DIMS_SSL, n_out=HIDDEN_DIMS_SSL,
                                         prefix='MYOW_r_pred')
            v_online = v_online_net(y_online)
            r_online = r_online_net(v_online)
        with tf.compat.v1.variable_scope("target", reuse=tf.compat.v1.AUTO_REUSE):
            v_target_net = get_predictor(n_in=256 * Config.CLUSTER_T, n_out=HIDDEN_DIMS_SSL,
                                         prefix='MYOW_v_pred')
            r_target_net = get_predictor(n_in=HIDDEN_DIMS_SSL, n_out=HIDDEN_DIMS_SSL,
                                         prefix='MYOW_r_pred')

        for k in range(k_t):
            nearby_cluster_idx = tf.gather(indx[:, k + 1], cluster_idx)
            nearby_batch_vecs = tf.reshape(
                tf.gather(cluster_membership_list, tf.cast(nearby_cluster_idx, tf.int32)), (-1,))
            N_target = tf.gather(y_target, nearby_batch_vecs)
            v_target = v_target_net(N_target)
            # r_target = r_target_net(v_target)
            self.myow_loss += tf.reduce_mean(cos_loss(r_online, v_target))  # + tf.reduce_mean(cos_loss(r_target, v_online))

        # with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        #     phi_s = get_online_predictor(n_in=256, n_out=CLUSTER_DIMS, prefix='SH_z_pred')(tf.reshape(h_acc[-1], (-1, 256)))
        #     self.myow_loss += tf.reduce_mean(cos_loss(phi_s, tf.transpose(tf.gather(self.protos, cluster_idx, axis=1), (1, 0))))

    """
    Intrinsic rewards
    """
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        self.R_I_SCALE = tf.nn.relu(
            get_linear_layer(n_in=256, n_out=1, prefix='r_i_scale',
                             init=initializers.RandomNormal(stddev=0.11))(
                tf.reshape(tf.stop_gradient(h_acc[-1]), (-1, 256))))
    # self.h = get_predictor(n_in=256+Config.N_SKILLS, n_out=256)(tf.concat([self.h, tf.stop_gradient(scores)], 1))

    """
    Condition on soft-cluster assignments for policy head (Cluster Conditioned Policy)
    """
    if Config.CLUSTER_CONDIT_POLICY:
        concat_code = tf.stop_gradient(tf.reshape(self.codes, [-1, Config.N_SKILLS]))
        # print(self.h)
        # print(concat_code)
        # self.h = tf.concat([self.h, concat_code], axis=1)
        # h_seq = tf.squeeze(tf.squeeze(FiLM(widths=[512, 512], name='FiLM_layer')([tf.expand_dims(tf.expand_dims(h_seq, 1), 1), act_one_hot]), 1), 1)

    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
            self.pd_train = []
            for i in range(Config.POLICY_NHEADS):
                with tf.compat.v1.variable_scope("head_" + str(i), reuse=tf.compat.v1.AUTO_REUSE):
                    self.pd_train.append(self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0])
            with tf.compat.v1.variable_scope("head_i", reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train_i = self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]
        else:
            with tf.compat.v1.variable_scope("head_0", reuse=tf.compat.v1.AUTO_REUSE):
                self.pd_train = self.pdtype.pdfromlatent(self.h, init_scale=0.01)[0]

        if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
            # self.vf_train = [fc(self.h, 'v' + str(i), 1)[:, 0] for i in range(Config.POLICY_NHEADS)]
            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]
        else:
            self.dummy_vf_train = fc(self.h, 'v_0', 1)
            self.dummy_vf_train_curr = fc(self.h, 'v_0', 1)[:, 0]
            self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]

        self.vf_i_train = fc(tf.stop_gradient(self.h), 'v_i', 1)[:, 0]
        self.vf_i_run = self.vf_i_train

    # Plain Dropout version: Only fast updates / stochastic latent for VIB
    self.pd_run = self.pd_train
    self.vf_run = self.vf_train
    # For Dropout: Always change layer, so slow layer is never used
    self.run_dropout_assign_ops = []

    # Use the current head for classical PPO updates
    a0_run_pre_clamp = self.pd_run.sample()
    neglogp0_run_pre_clamp = self.pd_run.neglogp(a0_run_pre_clamp)
    # normalize policies to (-1, 1) for DM control
    a0_run = tf.math.tanh(a0_run_pre_clamp)
    # After applying tanh, rescale the log-likelihoods as well:
    # assuming X ~ Normal() and Y = tanh(X), then log p(Y) = log p(X) - log dy/dx.
    neglogp0_run = neglogp0_run_pre_clamp + tf.reduce_sum(
        tf.math.log((1 - a0_run ** 2) + 1e-7), keepdims=True)
    # (2. * (np.log(2.) - a0_run_pre_clamp[idx] - tf.nn.softplus(-2. * a0_run_pre_clamp[idx])))

    self.initial_state = None

    def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict={}, *_args, **_kwargs):
        if Config.REPLAY:
            ob = ob.astype(np.float32)
        a, v, neglogp = sess.run([a0_run, self.vf_run, neglogp0_run],
                                 {REP_PROC: ob, Z: one_hot_skill})
        # sess.run([neglogp0_run[0]], {REP_PROC: ob})
        return a, v, 0., self.initial_state, neglogp

    def rep_vec(ob, *_args, **_kwargs):
        return sess.run(self.h, {X: ob})

    def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
        return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})

    def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
        return sess.run(self.vf_i_run, {REP_PROC: ob, Z: one_hot_skill})

    def nce_fw_pass(nce_dict):
        return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

    def custom_train(ob, rep_vecs):
        return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

    def compute_codes(ob, act):
        return sess.run([tf.reshape(self.codes, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                         tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                         tf.reshape(self.z_t_1, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
                         self.h_codes[:, 1:]],
                        {REP_PROC: ob, self.A_cluster: act})

    def compute_hard_codes(ob):
        return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

    def compute_cluster_returns(returns):
        return sess.run([self.cluster_returns], {self.R_cluster: returns})

    self.X = X
    self.processed_x = processed_x
    self.step = step
    self.value = value
    self.value_i = value_i
    self.rep_vec = rep_vec
    self.custom_train = custom_train
    self.nce_fw_pass = nce_fw_pass
    self.encoder = choose_cnn
    self.REP_PROC = REP_PROC
    self.Z = Z
    self.compute_codes = compute_codes
    self.compute_hard_codes = compute_hard_codes
    self.compute_cluster_returns = compute_cluster_returns
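# Minimal numpy check (illustrative, not from the source) of the tanh
# change-of-variables used above: with y = tanh(x), |dy/dx| = 1 - y**2, so
# neglogp(y) = neglogp(x) + sum log(1 - y**2), matching the correction applied
# to neglogp0_run for a standard-normal pre-squash sample.
import numpy as np

x = np.array([0.3, -1.2])
y = np.tanh(x)
neglogp_x = 0.5 * np.sum(x ** 2) + 0.5 * np.log(2 * np.pi) * x.size  # standard normal
neglogp_y = neglogp_x + np.sum(np.log((1 - y ** 2) + 1e-7))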
def train(train, restore):
    # Initialize the environment
    env = make_mujoco_env("Reacher-v2", 0)

    # new session
    sess = tf.Session()

    pdtype = make_pdtype(env.action_space)

    # initialize teacher agent
    teacher = TeacherAgent(env, sess, True, batch=1)

    # This observation placeholder is for querying teacher action
    ob_ph = U.get_placeholder(name="ob", dtype=tf.float32,
                              shape=[1, env.observation_space.shape[0]])

    with tf.variable_scope("LSTM"):
        # different from ob_ph, this tf placeholder holds a batch of observations for lstm training
        ob_batch_ph = tf.placeholder(name="ob_batch_ph", dtype=tf.float32,
                                     shape=[STEPS_UNROLLED, LSTM_BATCH_SIZE, OBSPACE_SHAPE])
        prev_pdflat_batch_ph = tf.placeholder(name="prev_pdflat_batch_ph", dtype=tf.float32,
                                              shape=[STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE])
        # prev_rew_batch_ph = tf.placeholder(name="prev_rew_batch_ph", dtype=tf.float32,
        #                                    shape=[STEPS_UNROLLED, MLP_BATCH_SIZE, 1])
        keep_prob_ph = tf.placeholder(name="keep_prob_ph", dtype=tf.float32, shape=[])
        # outer dim is 2 because of c_state and m_state
        initial_state_batch_ph = tf.placeholder(shape=[2, LSTM_BATCH_SIZE, NUM_UNITS],
                                                dtype=tf.float32)

    # lstm graph; shape of s_pdflat_batch: [STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE]
    s_pdflat_batch, final_state_batch = student_lstm_graph(
        ob_batch_ph, keep_prob_ph, prev_pdflat_batch_ph, initial_state_batch_ph)

    t_pdflat_batch_ph = tf.placeholder(name="t_pdflat_batch_ph",
                                       shape=[STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE],
                                       dtype=tf.float32)

    # get student action wrt the last observation: beginning at the last index
    # (i.e. STEPS_UNROLLED - 1), sample down 1 element in the first (outer)
    # dimension, and all elements in the inner dimensions
    s_pdflat_slice = tf.slice(s_pdflat_batch,
                              [(STEPS_UNROLLED - 1), (LSTM_BATCH_SIZE - 1), 0],
                              [1, 1, -1])

    # for stepping
    s_action = pdtype.pdfromflat(s_pdflat_slice).mean

    # get the collection of student variables within the 'LSTM' scope for optimization
    student_var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="LSTM")

    loss = kl_loss(s_pdflat_batch, t_pdflat_batch_ph, pdtype)

    with tf.name_scope("adam"):
        # adam optimizer to minimize the kl loss; learning rate is fixed here
        adam = tf.train.AdamOptimizer(learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8)
        minimize_adam = adam.minimize(loss, var_list=student_var)

    # initializer; to be placed at the very end
    init = tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="LSTM"))

    # saver for restoring/saving depending on whether or not to train
    # saver = tf.train.Saver(
    #     var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='LSTM'))

    train_writer = tf.summary.FileWriter("/home/winstonww/reacher/data/viz/1")
    train_writer.add_graph(sess.graph)

    # state variables for lstm
    zero_state_batch = np.zeros([2, LSTM_BATCH_SIZE, NUM_UNITS])
    curr_state_batch = zero_state_batch

    with sess:
        # run initializer for adam optimizer
        sess.run(tf.variables_initializer(adam.variables()))
        # run initializer for lstm variables
        if not restore:
            sess.run(init)
        # elif glob.glob(lstm_trained_data_path + "*"):
        #     saver.restore(sess, lstm_trained_data_path)
        else:
            print("attempt to restore trained data but {0} does not exist".format(
                lstm_trained_data_path))

        dataset = Dataset(dir_path=dataset_path)

        # reset env
        ob = env.reset()
        reward = 0

        if train:
            # in this loop we accumulate enough teacher data to get us started
            print("Begin Training! First Accumulate observation with teacher")
            while dataset.num_episodes() <= LSTM_BATCH_SIZE * 2:
                # accumulate observations and teacher action data
                t_mean, t_pdflat = sess.run(
                    (teacher.pi.pd.mean, teacher.pi.pd.flat),
                    feed_dict={ob_ph: np.expand_dims(ob, axis=0)})
                dataset.write(ob=ob, reward=reward, t_pdflat=t_pdflat,
                              s_pdflat=np.zeros([PDFLAT_SHAPE]), stepped_with='t')
                ob, reward, new, _ = env.step(t_mean)
                if new:
                    ob = env.reset()
            dataset.flush()
            print("Accumulated sufficient data points from teacher. now train")

            while True:
                total_loss = 0
                s = zero_state_batch

                # BPTT
                print("BPTT")
                for (ob_batch_array, t_pdflat_batch_array, prev_pdflat_batch_array,
                     prev_rew_batch_array) in dataset.training_batches():
                    # minimize loss to train student
                    l, s, _ = sess.run(
                        [loss, final_state_batch, minimize_adam],
                        feed_dict={
                            keep_prob_ph: KEEP_PROB,
                            ob_batch_ph: ob_batch_array,
                            # TODO: revert this back
                            # prev_pdflat_batch_ph: prev_pdflat_batch_array,
                            # prev_rew_batch_ph: prev_rew_batch_array,
                            # prev_pdflat_batch_ph: ob_batch_array,
                            # prev_pdflat_batch_ph: np.random.rand(STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE),
                            # prev_pdflat_batch_ph: np.zeros([STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE]),
                            t_pdflat_batch_ph: t_pdflat_batch_array,
                            initial_state_batch_ph: s
                        })
                    total_loss += l
                print("DONE")

                # Get teacher action for the last observation
                new = None
                while not new:
                    t_pdflat = sess.run(
                        (teacher.pi.pd.flat),
                        feed_dict={ob_ph: np.expand_dims(ob, axis=0)})
                    ob_batch_array, prev_pdflat_batch_array, prev_rew_batch_array = \
                        dataset.test_batch(ob)
                    # Get student action for the last observation
                    s_ac, s_pdflat, curr_state_batch = sess.run(
                        (s_action, s_pdflat_slice, final_state_batch),
                        feed_dict={
                            keep_prob_ph: 1,
                            ob_batch_ph: ob_batch_array,
                            # TODO: revert this back
                            # prev_pdflat_batch_ph: prev_pdflat_batch_array,
                            # prev_rew_batch_ph: prev_rew_batch_array,
                            # prev_pdflat_batch_ph: ob_batch_array,
                            # prev_pdflat_batch_ph: np.random.rand(STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE),
                            # prev_pdflat_batch_ph: np.zeros([STEPS_UNROLLED, LSTM_BATCH_SIZE, PDFLAT_SHAPE]),
                            initial_state_batch_ph: curr_state_batch
                        })
                    dataset.write(ob=ob, reward=reward, t_pdflat=t_pdflat,
                                  s_pdflat=s_pdflat, stepped_with='s')
                    # step with student
                    ob, reward, new, _ = env.step(s_ac)
                    if new:
                        print("************** Episode {0} ****************".format(
                            dataset.num_episodes()))
                        ob = env.reset()
                        print("recent loss: %f " % total_loss)
                dataset.flush()
                # save_path = saver.save(sess, lstm_trained_data_path)
                if dataset.num_episodes() % MAX_CAPACITY == 0:
                    dataset.dump()
                if dataset.num_episodes() == 5000:
                    break
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Dict)
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob_config = U.get_placeholder(name="ob", dtype=tf.float32,
                                  shape=[sequence_length] + list(ob_space.spaces['joint'].shape))
    ob_target = U.get_placeholder(name="goal", dtype=tf.float32,
                                  shape=[sequence_length] + list(ob_space.spaces['target'].shape))
    obs_pos = U.get_placeholder(name="obs_pos", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.spaces['obstacle_pos1'].shape))
    # is_training = U.get_placeholder(name="bn_training", dtype=tf.bool, shape=())

    # construct v function model
    '''with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space['joint'].shape)
    obz = tf.clip_by_value((ob_config - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    goal_last_out = tf.clip_by_value((ob_target - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)'''
    last_out = ob_config
    goal_last_out = ob_target
    obs_last_out = obs_pos
    for i in range(num_hid_layers):
        last_out = dense(last_out, hid_size, "vfcfc%i" % (i + 1),
                         weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        # last_out = tf.layers.batch_normalization(last_out, training=is_training, name="vfcbn%i" % (i + 1))
        last_out = tf.nn.tanh(last_out)
        goal_last_out = dense(goal_last_out, hid_size, "vfgfc%i" % (i + 1),
                              weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        # goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="vfgbn%i" % (i + 1))
        goal_last_out = tf.nn.tanh(goal_last_out)
        obs_last_out = dense(obs_last_out, hid_size, "vfobsfc%i" % (i + 1),
                             weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        # obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="vfobn%i" % (i + 1))
        obs_last_out = tf.nn.tanh(obs_last_out)
    vpred = tf.concat([last_out, goal_last_out, obs_last_out], -1)
    self.vpred = dense(vpred, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # construct policy probability distribution model
    last_out = ob_config
    goal_last_out = ob_target
    obs_last_out = obs_pos
    for i in range(num_hid_layers):
        last_out = dense(last_out, hid_size, "pol_cfc%i" % (i + 1),
                         weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        # last_out = tf.layers.batch_normalization(last_out, training=is_training, name="pol_cbn%i" % (i + 1))
        last_out = tf.nn.tanh(last_out)
        goal_last_out = dense(goal_last_out, hid_size, "pol_gfc%i" % (i + 1),
                              weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        # goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="pol_gbn%i" % (i + 1))
        goal_last_out = tf.nn.tanh(goal_last_out)
        obs_last_out = dense(obs_last_out, hid_size, "pol_obsfc%i" % (i + 1),
                             weight_init=U.normc_initializer(1.0), weight_loss_dict={})
        # obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="pol_obn%i" % (i + 1))
        obs_last_out = tf.nn.tanh(obs_last_out)
    last_out = tf.concat([last_out, goal_last_out, obs_last_out], -1)

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                     U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.constant_initializer([0.2, 0.2, -1., -1.]))
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal",
                        U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob_config, ob_target, obs_pos], [ac, self.vpred])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True,
          num_options=2, dc=0):
    assert isinstance(ob_space, gym.spaces.Box)

    # define action and observation space
    self.ac_space_dim = ac_space.shape[0]
    self.ob_space_dim = ob_space.shape[0]
    self.dc = dc
    self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
    self.num_options = num_options
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

    # create a filter for the pure shape, meaning excluding u[k-1]
    obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim),)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)
    with tf.variable_scope("obfilter_pure"):
        self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    obz_pure = tf.clip_by_value(
        (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std,
        -5.0, 5.0)

    # implement Q-function approximation
    last_out0 = obz       # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(U.dense(last_out0, hid_size, "vffc0%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(U.dense(last_out1, hid_size, "vffc1%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0))

    # return the Q-function value
    self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

    # implement parametrization for the policy over options
    last_out0 = obz       # for option 0
    last_out1 = obz_pure  # for option 1
    for i in range(num_hid_layers):
        last_out0 = tf.nn.relu(U.dense(last_out0, hid_size, "oppi0%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
        last_out1 = tf.nn.relu(U.dense(last_out1, hid_size, "oppi1%i" % (i + 1),
                                       weight_init=U.normc_initializer(1.0)))
    last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0))
    last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0))
    last_out = tf.concat([last_out0, last_out1], 1)

    # return probabilities for the options
    self.op_pi = tf.nn.softmax(last_out)

    # always terminate
    self.tpred = tf.nn.sigmoid(dense3D2(tf.stop_gradient(last_out), 1, "termhead", option,
                                        num_options=num_options,
                                        weight_init=U.normc_initializer(1.0)))[:, 0]
    termination_sample = tf.constant([True])

    # define the control policy / intra-option policy
    last_out = obz_pure
    for i in range(num_hid_layers):
        last_out = tf.nn.relu(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option,
                        num_options=num_options, weight_init=U.normc_initializer(0.01),
                        bias=False)
        # now also use relus to squash to -1, 1
        mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1
        logstd = tf.get_variable(name="logstd",
                                 shape=[num_options, 1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # sample stochastically -> this corresponds to exploration
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    # choose the appropriate action; apply the ZOH (zero-order hold, i.e. repeat
    # the previous input u[k-1] stored at the tail of the observation) if using option 0
    ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:]))
    ac = tf.clip_by_value(ac, -1.0, 1.0)

    self.last_action = tf.stop_gradient(ac)
    self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd])

    self._get_v = U.function([ob, option], [self.vpred])
    self.get_term = U.function([ob, option], [termination_sample])
    self.get_tpred = U.function([ob, option], [self.tpred])
    self.get_vpred = U.function([ob, option], [self.vpred])
    self._get_op = U.function([ob], [self.op_pi])
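# Illustrative numpy analogue (an assumption, not the author's code) of the
# zero-order hold applied above: when option 0 is active, the policy re-emits
# the previous input u[k-1], which this environment stores in the last
# ac_space_dim entries of the observation.
import numpy as np

ac_dim = 2
ob = np.array([[0.1, 0.2, 0.3, 0.5, -0.5]])  # tail holds u[k-1]
new_ac = np.array([[0.7, -0.7]])             # freshly sampled action (option 1)
option = 0
ac = new_ac if option else ob[:, -ac_dim:]   # option 0 -> hold previous input
ac = np.clip(ac, -1.0, 1.0)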
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('pol'):
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
                                   kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final',
                                      kernel_initializer=U.normc_initializer(0.01))

    # with tf.variable_scope('pol'):
    #     # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    #     last_out = ob
    #     for i in range(num_hid_layers):
    #         last_out = tf.nn.tanh(
    #             tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
    #                             kernel_initializer=U.normc_initializer(1.0),
    #                             bias_initializer=tf.constant_initializer(0.1)))
    #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
    #                            kernel_initializer=U.normc_initializer(0.01))
    #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
    #                              initializer=tf.zeros_initializer())
    #     # out_std = tf.exp(0.5 * logstd + 0.0)
    #     # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    #     import numpy as np
    #     pdparam = tf.concat([mean, mean * 0.0 + np.random.randn(pdtype.param_shape()[0] // 2) * logstd], axis=1)
    #
    #     # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
    #     #     mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
    #     #                            kernel_initializer=U.normc_initializer(0.01))
    #     #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
    #     #                              initializer=tf.zeros_initializer())
    #     #     out_std = tf.exp(0.5 * logstd + 0.0)
    #     #     # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    #     #     import numpy as np
    #     #     pdparam = tf.concat([mean, np.random.randn(pdtype.param_shape()[0] // 2) * out_std], axis=1)
    #     # else:
    #     #     pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final',
    #     #                               kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], ac)
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl,
             scope="policy"):
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. "
              "It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32, shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        pdparamsize = self.ac_pdtype.param_shape()[0]
        print('ob_mean shape: ', ob_mean.shape)

        sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)

        # convert each of the four stacked RGB frames to grayscale
        # (standard ITU-R 601 luma weights: 0.299 R + 0.587 G + 0.114 B)
        x = tf.cast(x, dtype=tf.float32)
        l = []
        for i in range(4):
            r = tf.multiply(x[:, :, :, i * 3], 0.299)
            g = tf.multiply(x[:, :, :, i * 3 + 1], 0.587)
            b = tf.multiply(x[:, :, :, i * 3 + 2], 0.114)
            gray = r + g + b
            l.append(gray)
        x = tf.stack(l, axis=-1)
        x = tf.cast(x, dtype=tf.int32)

        # apply the same grayscale conversion to the observation mean
        l = []
        for i in range(4):
            r = ob_mean[:, :, i * 3] * 0.299
            g = ob_mean[:, :, i * 3 + 1] * 0.587
            b = ob_mean[:, :, i * 3 + 2] * 0.114
            gray = r + g + b
            l.append(gray)
        print('before obmean: ', self.ob_mean.shape)
        self.ob_mean = np.stack(l, axis=-1)
        self.ob_rgb_mean = ob_mean
        print('after obmean: ', self.ob_mean.shape)

        self.flat_features = self.get_features(x, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, sh)

        with tf.variable_scope(scope, reuse=False):
            x = fc(self.flat_features, units=hidsize, activation=activ)
            x = fc(x, units=hidsize, activation=activ)
            pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
            vpred = fc(x, name='value_function_output', units=1, activation=None)
        pdparam = unflatten_first_dim(pdparam, sh)
        self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
def _init(self, ob_space, ac_space, hid_size_V, hid_size_actor, num_hid_layers,
          V_keep_prob, mc_samples, layer_norm, activation_critic,
          activation_actor, dropout_on_V, gaussian_fixed_var=True,
          sample_dropout=False):
    assert isinstance(ob_space, gym.spaces.Box)
    self.sample_dropout = sample_dropout
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    self.mc_samples = mc_samples
    self.V_keep_prob = V_keep_prob

    ### MAIN CHANGES #######################
    # Value function
    with tf.variable_scope("value_function"):
        dropout_networks = [last_out] * self.mc_samples
        # dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.V_keep_prob)
        for i in range(num_hid_layers):
            if layer_norm:
                last_out = activation_critic(tc.layers.layer_norm(
                    tf.layers.dense(last_out, hid_size_V, name="vffc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)),
                    center=True, scope="vffc_activation%i" % (i + 1), scale=True))
                apply_layer = lambda x: activation_critic(tc.layers.layer_norm(
                    tf.layers.dense(x, hid_size_V, name="vffc%i" % (i + 1), reuse=True),
                    center=True, scope="vffc_activation%i" % (i + 1), scale=True, reuse=True))
            else:
                last_out = activation_critic(
                    tf.layers.dense(last_out, hid_size_V, name="vffc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
                apply_layer = lambda x: activation_critic(
                    tf.layers.dense(x, hid_size_V, name="vffc%i" % (i + 1), reuse=True))
            dropout_networks = generate_dropout_layer(apply_layer, dropout_networks,
                                                      self.V_keep_prob)

        # final layer
        self.vpred = tf.layers.dense(last_out, 1, name="vffinal",
                                     kernel_initializer=U.normc_initializer(1.0))[:, 0]
        apply_layer = lambda x: tf.layers.dense(x, 1, activation=None,
                                                name="vffinal", reuse=True)[:, 0]
        # NB: generate_layer here (rather than generate_dropout_layer) presumably
        # skips dropout after the scalar output layer
        dropout_networks = generate_layer(apply_layer, dropout_networks, self.V_keep_prob)

        mean, variance = tf.nn.moments(tf.stack(dropout_networks), 0)
        self.vpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
        self.vpred_dropout_networks = dropout_networks
        self.variance = variance

        LAMBDA = tf.placeholder(dtype=tf.float32, shape=())
        self.v_lambda_variance = self.vpred_mc_mean + LAMBDA * tf.sqrt(variance)
    #######################

    # Policy
    last_out = obz
    with tf.variable_scope("policy"):
        for i in range(num_hid_layers):
            last_out = U.dense(last_out, hid_size_actor, "polfc%i" % (i + 1),
                               weight_init=U.normc_initializer(1.0))
            last_out = activation_actor(last_out)
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    last_out = obz

    ## Building function Q(s, a) -- experimental head, kept commented out
    # last_out2 = self.pd.sample()
    # activation = tf.nn.relu
    # #######################
    # # Action Value function
    # with tf.variable_scope("Q"):
    #     dropout_networks = [last_out] * self.mc_samples
    #     dropout_networks = generate_dropout_layer(lambda x: x, dropout_networks, self.keep_prob)
    #
    #     # concatenate state and action
    #     last_out = tf.concat([last_out, last_out2], axis=-1)
    #     new_networks = []
    #     for dropout_network in dropout_networks:
    #         dropout_network = tf.concat([dropout_network, last_out2], axis=-1)
    #         dropout_network, mask = U.bayes_dropout(dropout_network, self.keep_prob)
    #         new_networks.append(dropout_network)
    #     dropout_networks = new_networks
    #
    #     # hidden layers
    #     for i in range(num_hid_layers):
    #         last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="Q%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
    #         apply_layer = lambda x: activation(tf.layers.dense(x, hid_size, activation=None, name="Q%i" % (i + 1), reuse=True))
    #         dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.keep_prob)
    #
    #     # final layer
    #     self.qpred = tf.layers.dense(last_out, 1, name="Qfinal", kernel_initializer=U.normc_initializer(1.0))[:, 0]
    #     apply_layer = lambda x: tf.layers.dense(x, 1, activation=None, name="Qfinal", reuse=True)[:, 0]
    #     dropout_networks = generate_dropout_layer(apply_layer, dropout_networks, self.keep_prob)
    #
    #     self.qpred_mc_mean = tf.add_n(dropout_networks) / float(len(dropout_networks))
    #     self.qpred_dropout_networks = dropout_networks

    ### MAIN CHANGES
    if dropout_on_V:
        if self.sample_dropout:
            self._act = [U.function([stochastic, ob], [ac, x]) for x in dropout_networks]
        else:
            self._act = U.function([stochastic, ob], [ac, self.vpred_mc_mean])
    else:
        self._act = U.function([stochastic, ob], [ac, self.vpred])
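# The MC-dropout critic above relies on a generate_dropout_layer helper defined
# elsewhere in the codebase. A minimal sketch of what it plausibly does, for
# readability only (name and behaviour inferred from the call sites, not
# confirmed by the source): apply the shared layer to every network copy, then
# drop units independently per copy so the Monte-Carlo samples stay decorrelated.
def generate_dropout_layer(apply_layer, dropout_networks, keep_prob):
    new_networks = []
    for net in dropout_networks:
        out = apply_layer(net)                      # shared weights via reuse=True
        out = tf.nn.dropout(out, keep_prob=keep_prob)  # independent TF1-style dropout mask
        new_networks.append(out)
    return new_networks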
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, init_std=1.0,
          gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    self.varphi_dim = hid_size

    self.ob = utils.get_placeholder(name="ob", dtype=tf.float32,
                                    shape=[sequence_length] + list(ob_space.shape))
    # self.ob = tf.placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        obz = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(
                last_out, hid_size, name="fc%i" % (i + 1),
                kernel_initializer=utils.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            last_out, 1, name='final',
            kernel_initializer=utils.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        last_out = obz
        # Create 'num_hid_layers' hidden layers
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(tf.layers.dense(
                last_out, hid_size, name='fc%i' % (i + 1),
                kernel_initializer=utils.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            self.action_dim = ac_space.shape[0]
            # mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
            # logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            self.dist_diagonal = True
            self.varphi = last_out
            self.varphi_dim = hid_size

            if self.dist_diagonal:
                stddev_init = np.ones([1, self.action_dim]) * init_std
                prec_init = 1. / (np.multiply(stddev_init, stddev_init))  # 1 x |a|
                self.prec = tf.get_variable(
                    name="prec", shape=[1, self.action_dim],
                    initializer=tf.constant_initializer(prec_init))
                kt_init = np.ones([self.varphi_dim, self.action_dim]) * 0.5 / self.varphi_dim
                ktprec_init = kt_init * prec_init
                self.ktprec = tf.get_variable(
                    name="ktprec", shape=[self.varphi_dim, self.action_dim],
                    initializer=tf.constant_initializer(ktprec_init))
                kt = tf.divide(self.ktprec, self.prec)
                mean = tf.matmul(last_out, kt)
                logstd = tf.log(tf.sqrt(1. / self.prec))
            else:
                # Not implemented yet
                raise NotImplementedError

            self.prec_get_flat = utils.GetFlat([self.prec])
            self.prec_set_from_flat = utils.SetFromFlat([self.prec])
            self.ktprec_get_flat = utils.GetFlat([self.ktprec])
            self.ktprec_set_from_flat = utils.SetFromFlat([self.ktprec])

            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name='final',
                kernel_initializer=utils.normc_initializer(0.01))

    self.scope = tf.get_variable_scope().name

    self.pd = pdtype.pdfromflat(pdparam)
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = utils.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = utils.function([stochastic, self.ob], [ac, self.vpred])

    # Get all policy parameters
    vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + '/pol')
    # Remove log-linear parameters ktprec and prec to get only non-linear parameters
    del vars[-1]
    del vars[-1]
    beta_params = vars

    # Flat w_beta
    beta_len = np.sum([np.prod(p.get_shape().as_list()) for p in beta_params])
    w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

    # Unflatten w_beta
    beta_shapes = list(map(tf.shape, beta_params))
    w_beta_unflat_var = self.unflatten_tensor_variables(w_beta_var, beta_shapes)

    # w_beta^T * \grad_beta \varphi(s)^T
    v = tf.placeholder(dtype=self.varphi.dtype, shape=self.varphi.get_shape(),
                       name="v_in_Rop")
    features_beta = self.alternative_Rop(self.varphi, beta_params, w_beta_unflat_var, v)
    self.features_beta = utils.function([self.ob, w_beta_var, v], features_beta)
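# The features_beta graph above needs a forward-mode Jacobian-vector product,
# which TF1 lacks natively. A plausible reading of alternative_Rop, suggested by
# the "v_in_Rop" dummy placeholder (an assumption, not confirmed by the source),
# is the standard trick of expressing the R-operator as two reverse-mode passes:
def alternative_Rop(f, x, u, v):
    """Jacobian-vector product (df/dx) u, via double backprop with dummy v."""
    g = tf.gradients(f, x, grad_ys=v)      # g = v^T (df/dx); linear in the dummy v
    return tf.gradients(g, v, grad_ys=u)   # d/dv [u^T g] = (df/dx) u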
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=True):  # pylint: disable=W0613
    ob_shape = (nbatch,) + ob_space.shape
    actdim = ac_space.shape[0]
    window_length = ob_space.shape[1] - 1
    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs

    with tf.variable_scope("model", reuse=reuse) as scope:
        # first column carries the previous portfolio weights; the rest is the price window
        w0 = tf.slice(X, [0, 0, 0, 0], [-1, -1, 1, 1])
        x = tf.slice(X, [0, 0, 1, 0], [-1, -1, -1, -1])
        # reuse when testing
        x = conv(tf.cast(x, tf.float32), 'c1', fh=1, fw=3, nf=3, stride=1,
                 init_scale=np.sqrt(2))
        x = conv(x, 'c2', fh=1, fw=window_length - 2, nf=20,
                 stride=window_length - 2, init_scale=np.sqrt(2))
        x = tf.concat([x, w0], 3)
        x = conv(x, 'c3', fh=1, fw=1, nf=1, stride=1, init_scale=np.sqrt(2))
        cash_bias = tf.ones([x.shape[0], 1, 1, 1], tf.float32)
        c = tf.concat([cash_bias, x], 1)
        v = conv_to_fc(x)
        vf = fc(v, 'v', 1)[:, 0]
        f = tf.contrib.layers.flatten(c)
        pi = tf.nn.softmax(f)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.truncated_normal_initializer())
        # logstd = tf.Print(logstd, [logstd], 'logstd ')
        eps = 50
        # logstd = tf.clip_by_value(logstd, -eps, eps, 'clip_logstd')

    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    # a0 = tf.clip_by_value(a0, -eps, eps, 'clip2')
    a0 = tf.nn.softmax(a0)  # renormalize the sampled vector into portfolio weights
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp, lst, p = sess.run([a0, vf, neglogp0, logstd, pi], {X: ob})
        return a, v, self.initial_state, neglogp, lst[0], p

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, action_space, nbatch, nsteps, reuse=False):
    # Gain used to initialize our kernels
    gain = np.sqrt(2)

    # Based on the action space, select the probability distribution type used
    # to sample actions in our stochastic policy (in our case DiagGaussianPdType,
    # i.e. a diagonal Gaussian distribution)
    self.pdtype = make_pdtype(action_space)

    height, width, channel = ob_space.shape
    ob_shape = (height, width, channel)

    # Create the input placeholder
    inputs_ = tf.placeholder(tf.float32, [None, *ob_shape], name="input")

    # Normalize the images
    scaled_images = tf.cast(inputs_, tf.float32) / 255.

    """
    Build the model:
    3 CNNs for spatial dependencies
    (temporal dependencies are handled by stacking frames; funnily enough,
    nobody used an LSTM in the OpenAI Retro contest)
    1 common FC
    1 FC for policy
    1 FC for value
    """
    with tf.variable_scope("model", reuse=reuse):
        conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
        conv2 = conv_layer(conv1, 64, 4, 2, gain)
        conv3 = conv_layer(conv2, 64, 3, 1, gain)
        flatten1 = tf.layers.flatten(conv3)
        fc_common = fc_layer(flatten1, 512, gain=gain)

        # Build a fully connected layer that returns a probability distribution
        # over actions (self.pd) and our pi logits (self.pi).
        self.pd, self.pi = self.pdtype.pdfromlatent(fc_common, init_scale=0.01)

        # Calculate v(s)
        vf = fc_layer(fc_common, 1, activation_fn=None)[:, 0]

    self.initial_state = None

    # Sample an action from the action distribution (we are using a stochastic
    # policy, so we don't always take the action with the highest probability;
    # for instance, with two actions at 0.7 and 0.3 we pick the second 30% of the time)
    a0 = self.pd.sample()

    # Take a step: returns the action to take and V(s)
    def step(state_in, *_args, **_kwargs):
        action, value = sess.run([a0, vf], {inputs_: state_in})
        return action, value

    # Calculate only V(s)
    def value(state_in, *_args, **_kwargs):
        return sess.run(vf, {inputs_: state_in})

    # Output only the action to take
    def select_action(state_in, *_args, **_kwargs):
        return sess.run(a0, {inputs_: state_in})

    self.inputs_ = inputs_
    self.vf = vf
    self.step = step
    self.value = value
    self.select_action = select_action
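# conv_layer and fc_layer are helpers from this tutorial's utilities, not shown
# here. A minimal sketch consistent with the calls above; orthogonal init scaled
# by `gain` is the usual convention in baselines-style code, so treat the exact
# details as assumed rather than authoritative:
def conv_layer(inputs, filters, kernel_size, strides, gain=1.0):
    return tf.layers.conv2d(inputs, filters, kernel_size, strides,
                            activation=tf.nn.relu,
                            kernel_initializer=tf.orthogonal_initializer(gain))

def fc_layer(inputs, units, activation_fn=tf.nn.relu, gain=1.0):
    return tf.layers.dense(inputs, units, activation=activation_fn,
                           kernel_initializer=tf.orthogonal_initializer(gain))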
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 ob_shape = (nbatch, ) + ob_space.shape # Discrete actdim = ac_space.n with tf.compat.v1.variable_scope('policy', reuse=reuse): X = tf.compat.v1.placeholder(tf.float32, ob_shape, name='Ob') #obs activ = tf.tanh # logstd = tf.Variable(name="logstd", shape=[1, actdim], initial_value=tf.zeros([1, actdim])) h1 = activ(fc(X, 'v_mix_fc1', nh=64, init_scale=np.sqrt(2))) h2 = activ(fc(h1, 'v_mix_fc2', nh=64, init_scale=np.sqrt(2))) v_mix0 = fc(h2, 'v_mix', 1)[:, 0] h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) pi = fc(h2, 'pi', actdim, init_scale=0.01) with tf.compat.v1.variable_scope('intrinsic', reuse=reuse): X_ALL = tf.compat.v1.placeholder(tf.float32, (None, ) + ob_space.shape, name='Ob_all') #obs A_ALL = tf.compat.v1.placeholder(tf.float32, [None, actdim], name='Ac_all') #obs INPUT = tf.concat([X_ALL, A_ALL], axis=1) activ = tf.tanh h1 = activ(fc(INPUT, 'intrinsic_fc1', nh=64, init_scale=np.sqrt(2))) h2 = activ(fc(h1, 'intrinsic_fc2', nh=64, init_scale=np.sqrt(2))) r_in0 = tf.tanh(fc(h2, 'r_in', 1))[:, 0] h1 = activ(fc(X, 'v_ex_fc1', nh=64, init_scale=np.sqrt(2))) h2 = activ(fc(h1, 'v_ex_fc2', nh=64, init_scale=np.sqrt(2))) v_ex0 = fc(h2, 'v_ex', 1)[:, 0] self.pdtype = make_pdtype(ac_space) self.pd = self.pdtype.pdfromflat(pi) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): a, v_ex, v_mix, neglogp = sess.run([a0, v_ex0, v_mix0, neglogp0], {X: ob}) return a, v_ex, v_mix, self.initial_state, neglogp def value(ob, *_args, **_kwargs): v_ex, v_mix = sess.run([v_ex0, v_mix0], {X: ob}) return v_ex, v_mix def intrinsic_reward(ob, ac, *_args, **_kwargs): r_in = sess.run(r_in0, {X_ALL: ob, A_ALL: ac}) return r_in self.X = X self.X_ALL = X_ALL self.A_ALL = A_ALL self.pi = pi self.v_ex = v_ex0 self.r_in = r_in0 self.v_mix = v_mix0 self.step = step self.value = value self.intrinsic_reward = intrinsic_reward self.policy_params = tf.compat.v1.trainable_variables("policy") self.intrinsic_params = tf.compat.v1.trainable_variables("intrinsic") self.policy_new_fn = MlpPolicyNew
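# Hypothetical usage of the policy above, which exposes two value heads plus a
# learned intrinsic reward network (the helper name and calling code are
# illustrative, not from the source): step() samples actions and returns both
# value estimates, while intrinsic_reward() scores state-action pairs.
def rollout_step(policy, obs, one_hot_actions):
    a, v_ex, v_mix, _, neglogp = policy.step(obs)          # extrinsic and mixed values
    r_in = policy.intrinsic_reward(obs, one_hot_actions)   # learned intrinsic reward r_in(s, a)
    return a, v_ex, v_mix, neglogp, r_in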
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, max_grad_norm,
             **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    self.rep_loss = None

    # explicitly create vector space for latent vectors
    latent_space = Box(-np.inf, np.inf, shape=(256,))

    # So that I can compute the saliency map
    if Config.REPLAY:
        X = tf.compat.v1.placeholder(shape=(nbatch,) + ob_space.shape,
                                     dtype=np.float32, name='Ob')
        processed_x = X
    else:
        X, processed_x = observation_input(ob_space, None)

    TRAIN_NUM_STEPS = Config.NUM_STEPS // 16
    REP_PROC = tf.compat.v1.placeholder(dtype=tf.float32, shape=(None, 64, 64, 3),
                                        name='Rep_Proc')
    Z_INT = tf.compat.v1.placeholder(dtype=tf.int32, shape=(), name='Curr_Skill_idx')
    Z = tf.compat.v1.placeholder(dtype=tf.float32, shape=(nbatch, Config.N_SKILLS),
                                 name='Curr_skill')
    CODES = tf.compat.v1.placeholder(dtype=tf.float32, shape=(1024, Config.N_SKILLS),
                                     name='Train_Codes')
    CLUSTER_DIMS = 256
    HIDDEN_DIMS_SSL = 256
    STEP_BOOL = tf.placeholder(tf.bool, shape=[])
    self.protos = tf.compat.v1.Variable(
        initial_value=tf.random.normal(shape=(CLUSTER_DIMS, Config.N_SKILLS)),
        trainable=True, name='Prototypes')
    self.A = self.pdtype.sample_placeholder([None], name='A')
    self.R = tf.compat.v1.placeholder(tf.float32, [None], name='R')

    # trajectories of length m, for N policy heads
    self.STATE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3])
    self.STATE_NCE = tf.compat.v1.placeholder(tf.float32,
                                              [Config.REP_LOSS_M, 1, None, 64, 64, 3])
    self.ANCH_NCE = tf.compat.v1.placeholder(tf.float32, [None, 64, 64, 3])
    # labels of Q value quantile bins
    self.LAB_NCE = tf.compat.v1.placeholder(tf.float32, [Config.POLICY_NHEADS, None])
    self.A_i = self.pdtype.sample_placeholder([None, Config.REP_LOSS_M, 1], name='A_i')
    self.R_cluster = tf.compat.v1.placeholder(tf.float32, [None])
    self.A_cluster = self.pdtype.sample_placeholder([None, Config.NUM_ENVS],
                                                    name='A_cluster')

    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        act_condit, act_invariant, slow_dropout_assign_ops, fast_dropout_assigned_ops = \
            choose_cnn(processed_x)
        self.train_dropout_assign_ops = fast_dropout_assigned_ops
        self.run_dropout_assign_ops = slow_dropout_assign_ops
        self.h = tf.concat([act_condit, act_invariant], axis=1)

    """
    Bisimulation code
    """
    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        # encoder loss
        act_one_hot_target = tf.reshape(tf.one_hot(self.A, ac_space.n),
                                        (-1, ac_space.n))
        pred_next_latent_mu1 = get_transition_model()(
            tf.concat([self.h, act_one_hot_target], axis=1))
        pred_next_latent_mu2 = shuffle_custom(pred_next_latent_mu1)

        z_dist = tf.reduce_mean(
            tf.compat.v1.losses.huber_loss(
                self.h, shuffle_custom(self.h),
                reduction=tf.compat.v1.losses.Reduction.NONE), 1)
        r_dist = tf.compat.v1.losses.huber_loss(
            self.R, shuffle_custom(self.R),
            reduction=tf.compat.v1.losses.Reduction.NONE)
        transition_dist = tf.reduce_mean(
            tf.compat.v1.losses.huber_loss(
                pred_next_latent_mu1, pred_next_latent_mu2,
                reduction=tf.compat.v1.losses.Reduction.NONE), 1)

        bisimilarity = r_dist + Config.GAMMA * transition_dist
        self.encoder_bisimilarity_loss = tf.reduce_mean(
            tf.math.pow(z_dist - bisimilarity, 2))

        # latent loss
        pred_next_latent_mu1_3d = tf.transpose(
            tf.reshape(pred_next_latent_mu1, [-1, Config.NUM_ENVS, 256]),
            (1, 0, 2))  # 32 x n_timesteps x n_hidden
        h_3d = tf.transpose(tf.reshape(self.h, [-1, Config.NUM_ENVS, 256]),
                            (1, 0, 2))  # 32 x n_timesteps x n_hidden
        pred_next_latent_mu1 = pred_next_latent_mu1_3d[:, :-1, :]  # t = 0 to n_timesteps - 1
        next_h = h_3d[:, 1:, :]  # t = 1 to n_timesteps
        diff = pred_next_latent_mu1 - tf.stop_gradient(next_h)
        self.latent_transition_loss = tf.reduce_mean(0.5 * tf.math.pow(diff, 2))

    with tf.compat.v1.variable_scope("online", reuse=tf.compat.v1.AUTO_REUSE):
        with tf.compat.v1.variable_scope("head_0", reuse=tf.compat.v1.AUTO_REUSE):
            self.pd_train = [self.pdtype.pdfromlatent(tf.stop_gradient(self.h),
                                                      init_scale=0.01)[0]]
        self.vf_train = [fc(self.h, 'v_0', 1)[:, 0]]

        # Plain Dropout version: only fast updates / stochastic latent for VIB
        self.pd_run = self.pd_train
        self.vf_run = self.vf_train

        # For Dropout: always change layer, so the slow layer is never used
        self.run_dropout_assign_ops = []

    # Use the current head for classical PPO updates
    a0_run = [self.pd_run[head_idx].sample()
              for head_idx in range(Config.POLICY_NHEADS)]
    neglogp0_run = [self.pd_run[head_idx].neglogp(a0_run[head_idx])
                    for head_idx in range(Config.POLICY_NHEADS)]
    self.initial_state = None

    def step(ob, update_frac, skill_idx=None, one_hot_skill=None, nce_dict={},
             *_args, **_kwargs):
        if Config.REPLAY:
            ob = ob.astype(np.float32)
        head_idx = 0
        a, v, neglogp = sess.run(
            [a0_run[head_idx], self.vf_run[head_idx], neglogp0_run[head_idx]],
            {X: ob})
        return a, v, self.initial_state, neglogp

    def rep_vec(ob, *_args, **_kwargs):
        return sess.run(self.h, {X: ob})

    def value(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
        if Config.AGENT == 'ppo_diayn':
            return sess.run(self.vf_run, {X: ob, Z: one_hot_skill})
        elif Config.AGENT == 'ppo_goal':
            return sess.run(self.vf_run, {REP_PROC: ob, Z: one_hot_skill})
        else:
            return sess.run(self.vf_run, {self.STATE: ob, X: ob})

    def value_i(ob, update_frac, one_hot_skill=None, *_args, **_kwargs):
        if Config.AGENT == 'ppo_diayn':
            return sess.run(self.vf_i_run, {X: ob, Z: one_hot_skill})
        elif Config.AGENT == 'ppo_goal':
            return sess.run(self.vf_i_run, {REP_PROC: ob, Z: one_hot_skill})
        else:
            return sess.run(self.vf_i_run, {self.STATE: ob, X: ob})

    def nce_fw_pass(nce_dict):
        return sess.run([self.vf_i_run, self.rep_loss], nce_dict)

    def custom_train(ob, rep_vecs):
        return sess.run([self.rep_loss], {X: ob, REP_PROC: rep_vecs})[0]

    def compute_codes(ob, act):
        return sess.run(
            [tf.reshape(self.codes, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
             tf.reshape(self.u_t, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
             tf.reshape(self.z_t_1, (Config.NUM_ENVS, Config.NUM_STEPS, -1)),
             self.h_codes[:, 1:]],
            {REP_PROC: ob, self.A_cluster: act})

    def compute_hard_codes(ob):
        return sess.run([self.codes, self.u_t, self.z_t_1], {REP_PROC: ob})

    def compute_cluster_returns(returns):
        return sess.run([self.cluster_returns], {self.R_cluster: returns})

    self.X = X
    self.processed_x = processed_x
    self.step = step
    self.value = value
    self.value_i = value_i
    self.rep_vec = rep_vec
    self.custom_train = custom_train
    self.nce_fw_pass = nce_fw_pass
    self.encoder = choose_cnn
    self.REP_PROC = REP_PROC
    self.Z = Z
    self.compute_codes = compute_codes
    self.compute_hard_codes = compute_hard_codes
    self.compute_cluster_returns = compute_cluster_returns
    self.CODES = CODES
    self.STEP_BOOL = STEP_BOOL
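# shuffle_custom is not defined in this file. The bisimulation loss above only
# needs it to pair each batch element with a random partner (the target being
# |r_i - r_j| + gamma * transition distance), so a plausible sketch -- an
# assumption, not the repo's definition -- is a random permutation of the batch:
def shuffle_custom(x):
    idx = tf.random.shuffle(tf.range(tf.shape(x)[0]))  # random batch permutation
    return tf.gather(x, idx)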
def __init__(self, env, observations, goals, latent, estimate_q=False, vf_latent=None, sess=None, **tensors): """ Parameters: ---------- env RL environment observations tensorflow placeholder in which the observations will be fed latent latent state from which policy distribution parameters should be inferred vf_latent latent state from which value function should be inferred (if None, then latent is used) sess tensorflow session to run calculations in (if None, default session is used) **tensors tensorflow tensors for additional attributes such as state or mask """ self.X = observations self.state = tf.constant([]) self.initial_state = None self.__dict__.update(tensors) vf_latent = vf_latent if vf_latent is not None else latent vf_latent = tf.layers.flatten(vf_latent) latent = tf.layers.flatten(latent) # Based on the action space, will select what probability distribution type self.pdtype = make_pdtype(env.action_space) if goals is not None: self.goals = goals addition_layers = False activ = tf.nn.tanh nh = 256 if addition_layers: latent = tf.layers.dense(latent, units=nh, activation=activ) vf_latent = tf.layers.dense(vf_latent, units=nh, activation=activ) self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01) # Take an action self.action = self.pd.sample() # Calculate the neg log of our probability self.neglogp = self.pd.neglogp(self.action) self.sess = sess or tf.get_default_session() if estimate_q: assert isinstance(env.action_space, gym.spaces.Discrete) self.q = fc(vf_latent, 'q', env.action_space.n) self.vf = self.q else: self.vf = fc(vf_latent, 'vf', 1) self.vf = self.vf[:, 0]
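# The fc helper used throughout these snippets is the baselines a2c.utils-style
# fully connected layer with orthogonal initialization; roughly the following
# (consult baselines for the authoritative version):
def ortho_init(scale=1.0):
    def _ortho_init(shape, dtype, partition_info=None):
        flat_shape = (int(np.prod(shape[:-1])), shape[-1])
        a = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = (u if u.shape == flat_shape else v).reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init

def fc(x, scope, nh, init_scale=1.0, init_bias=0.0):
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
        return tf.matmul(x, w) + b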
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={})) self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={})) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred])
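# Typical use of the _act function built above, mirroring baselines'
# mlp_policy.act (the wrapper below is illustrative; ob[None] adds the batch
# axis for a single observation):
def act(pi, stochastic, ob):
    ac, vpred = pi._act(stochastic, ob[None])
    return ac[0], vpred[0]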
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, use_bias=True, use_critic=True, seed=None, hidden_W_init=U.normc_initializer(1.0), hidden_b_init=tf.zeros_initializer(), output_W_init=U.normc_initializer(0.01), output_b_init=tf.zeros_initializer()): """Params: ob_space: task observation space ac_space : task action space hid_size: width of hidden layers num_hid_layers: depth gaussian_fixed_var: True->separate parameter for logstd, False->two-headed mlp use_bias: whether to include bias in neurons """ assert isinstance(ob_space, gym.spaces.Box) if isinstance(hid_size, list): num_hid_layers = len(hid_size) else: hid_size = [hid_size] * num_hid_layers if seed is not None: tf.set_random_seed(seed) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) #Critic if use_critic: with tf.variable_scope('vf'): obz = tf.clip_by_value( (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense(last_out, hid_size[i], name="fc%i" % (i + 1), kernel_initializer=hidden_W_init)) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=hidden_W_init)[:, 0] #Actor with tf.variable_scope('pol'): last_out = tf.clip_by_value( (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense(last_out, hid_size[i], name='fc%i' % (i + 1), kernel_initializer=hidden_W_init, use_bias=use_bias)) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): self.mean = mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=output_W_init, use_bias=use_bias) self.logstd = logstd = tf.get_variable( name="pol_logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=output_b_init) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=output_W_init) #Acting self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) if use_critic: self._act = U.function([stochastic, ob], [ac, self.vpred]) else: self._act = U.function([stochastic, ob], [ac, tf.zeros(1)]) #Evaluating self.ob = ob self.ac_in = U.get_placeholder(name="ac_in", dtype=ac_space.dtype, shape=[sequence_length] + list(ac_space.shape)) self.gamma = U.get_placeholder(name="gamma", dtype=tf.float32, shape=[]) self.rew = U.get_placeholder(name="rew", dtype=tf.float32, shape=[sequence_length] + [1]) self.logprobs = self.pd.logp(self.ac_in) # [\log\pi(a|s)] #Fisher with tf.variable_scope('pol') as vs: self.weights = weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \ scope=vs.name) self.flat_weights = flat_weights = tf.concat( [tf.reshape(w, [-1]) for w in weights], axis=0) self.n_weights = flat_weights.shape[0].value self.score = score = U.flatgrad(self.logprobs, weights) # \nabla\log p(\tau) self.fisher = tf.einsum('i,j->ij', score, score) #Performance graph initializations self._setting = []
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) # logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) logstd = tf.multiply( tf.ones(shape=[1, pdtype.param_shape()[0] // 2]), tf.constant(0.5 / ac_space.shape[0])) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, scope, ob_space, ac_space, ob_mean, ob_std, perception, feat_spec, policy_spec, activation, layernormalize, batchnormalize, add_noise, keep_noise, noise_std, transfer_load, num_layers, keep_dim, transfer_dim, vf_coef, coinrun): # warnings # i do not want to accidentally pass layernormalize and batchnormalize # for coinrun if layernormalize: print( "Warning: policy is operating on top of layer-normed features." ) raise NotImplementedError() if batchnormalize: print( "Warning: policy is operating on top of batch-normed features." ) raise NotImplementedError() self.transfer_load = transfer_load self.transfer_dim = transfer_dim self.num_layers = num_layers self.keep_dim = keep_dim self.coinrun = coinrun self.ob_mean = ob_mean self.ob_std = ob_std self.add_noise = add_noise self.keep_noise = keep_noise self.noise_std = noise_std self.layernormalize = layernormalize self.batchnormalize = batchnormalize self.vf_coef = vf_coef input_shape = ob_space.shape # perception module self.perception = perception # feature dimensions (HARD-CODED NOT GOOD) self.feat_dim = 512 # policy module self.feat_spec = feat_spec self.policy_spec = policy_spec self.activation = activation with tf.variable_scope(scope): self.ob_space = ob_space self.ac_space = ac_space self.ac_pdtype = make_pdtype(ac_space) # placeholders dtype = ob_space.dtype if dtype == np.int8: dtype = np.uint8 print('policy.py, class Policy, def __init__, dtype: {}'.format( dtype)) # taken from baselines.common.input import observation_input self.ph_ob = tf.to_float( tf.placeholder(dtype=ob_space.dtype, shape=(None, ) + ob_space.shape, name='ob')) self.ph_ac = self.ac_pdtype.sample_placeholder([None], name='ac') self.pd = self.vpred = None self.scope = scope self.pdparamsize = self.ac_pdtype.param_shape()[0] with tf.variable_scope(self.scope + '_representation', reuse=False): self.unflattened_out = self.get_out(self.ph_ob, reuse=False) out = utils.flatten(self.unflattened_out) print( 'policy.py, class Policy, def __init__, self.out.shape: {}' .format(out.shape)) # we get features (feat_dim 512) self.features = self.get_features(out, reuse=False) pdparam, self.vpred = self.get_policy(self.features, reuse=False) self.pd = pd = self.ac_pdtype.pdfromflat(pdparam) self.a_samp = pd.sample() self.entropy = pd.entropy() self.nlp_samp = pd.neglogp(self.a_samp) self.logits = pdparam print( 'policy.py, class Policy, def __init__, pdparam.shape: {}, pdparam.dtype: {}' .format(pdparam.shape, pdparam.dtype)) print( 'policy.py, class Policy, def __init__, self.vpred: {}'.format( self.vpred.shape)) print('policy.py, class Policy, def __init__, self.a_samp: {}'. format(self.a_samp.shape)) print( 'policy.py, class Policy, def __init__, self.entropy.shape: {}' .format(self.entropy.shape)) print( 'policy.py, class Policy, def __init__, self.nlp_samp.shape: {}' .format(self.nlp_samp.shape)) print( 'policy.py, class Policy, def __init__, self.logits.shape: {}'. format(self.logits.shape))
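# get_out, get_features and get_policy are defined elsewhere on this class.
# To make the dataflow above readable, here is a hypothetical get_policy
# consistent with how it is called (pdparamsize-wide flat pd params plus a
# scalar value head) -- purely illustrative, not the repo's implementation.
# Note also that the dtype computed above for int8 observations is never passed
# to the placeholder (ob_space.dtype is used instead), which looks unintended.
def get_policy(self, features, reuse=False):
    with tf.variable_scope('policy', reuse=reuse):
        pdparam = tf.layers.dense(features, self.pdparamsize, name='pd',
                                  kernel_initializer=tf.orthogonal_initializer(0.01))
        vpred = tf.layers.dense(features, 1, name='v',
                                kernel_initializer=tf.orthogonal_initializer(1.0))[:, 0]
    return pdparam, vpred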
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, K=32, reuse=False, M=None): #pylint: disable=W0613 assert M is not None ob_shape = (nbatch, ) + ob_space.shape actdim = ac_space.shape[0] X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs act = tf.tanh with tf.variable_scope("model", reuse=reuse): h1 = act(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) h2 = act(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) pi = fc(h2, 'pi', actdim, init_scale=0.01) h1 = act(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) h2 = act(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) vf = fc(h2, 'vf', K) #[:,0] logstd = tf.get_variable(name="logstd", shape=[1, actdim], initializer=tf.zeros_initializer()) # reparameterize actions noise = tf.random_normal([nbatch, M, actdim]) mu = tf.expand_dims(pi, axis=1) std = tf.expand_dims(tf.exp(pi * 0.0 + logstd), axis=1) a_reparameterized = mu + std * noise # sample actions pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1) self.pdtype = make_pdtype(ac_space) self.pd = self.pdtype.pdfromflat(pdparam) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None # distributional info self.K = K vf_mean = tf.reduce_mean(vf, axis=-1) def step(ob, *_args, **_kwargs): a, v, neglogp, batchactions, v_avg = sess.run( [a0, vf, neglogp0, a_reparameterized, vf_mean], {X: ob}) return a, v, self.initial_state, neglogp, batchactions, v_avg def value(ob, *_args, **_kwargs): return sess.run(vf_mean, {X: ob}) self.a0 = a0 self.X = X self.pi = pi self.vf = vf self.vf_mean = vf_mean self.step = step self.value = value self.a_reparameterized = a_reparameterized
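# Hypothetical single step with the distributional policy above (helper name is
# illustrative): vf carries K value heads per state, a_reparameterized carries
# M mean + std * noise action samples, and value() already averages the heads.
def distributional_step(policy, obs):
    a, vf_k, state, neglogp, a_rep, v_avg = policy.step(obs)
    # vf_k has one estimate per head; v_avg is their mean, which matches
    # what policy.value(obs) returns
    return a, v_avg, neglogp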
def _init(self, ob_space, ac_space, hid_dims_p, hid_dims_v, train=True):
    assert isinstance(ob_space, gym.spaces.Box)
    self.pdtype = pdtype = make_pdtype(ac_space)
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.hid_dims_p = hid_dims_p
    self.hid_dims_v = hid_dims_v

    with tf.variable_scope("cnn"):
        self.ob = U.get_placeholder(name="ob", dtype=tf.float32,
                                    shape=[None] + list(ob_space.shape))
        # self.obz = tf.clip_by_value((self.ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        self.x = self.ob / 255.0

        W_conv1 = tf.Variable(tf.truncated_normal([5, 5, 4, 32], stddev=0.1))
        b_conv1 = tf.Variable(tf.constant(0.1, shape=[32]))
        h_conv1 = tf.nn.relu(tf.nn.conv2d(self.x, W_conv1, strides=[1, 1, 1, 1],
                                          padding="SAME") + b_conv1)
        h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 4, 4, 1],
                                 strides=[1, 4, 4, 1], padding='SAME')

        W_conv2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1))
        b_conv2 = tf.Variable(tf.constant(0.1, shape=[64]))
        h_conv2 = tf.nn.relu(tf.nn.conv2d(h_pool1, W_conv2, strides=[1, 1, 1, 1],
                                          padding="SAME") + b_conv2)
        h_pool2 = tf.nn.max_pool(h_conv2, ksize=[1, 4, 4, 1],
                                 strides=[1, 4, 4, 1], padding='SAME')

        # Fully connected layer with 1024 hidden units
        W_fc1 = tf.Variable(tf.truncated_normal([8 * 8 * 64, 1024], stddev=0.1))
        b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]))
        # Flatten the pooled feature map to 1-D
        # (the 8 * 8 * 64 size implies 128x128 inputs after two stride-4 pools)
        h_pool3_flat = tf.reshape(h_pool2, [-1, 8 * 8 * 64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)  # nonlinear activation
        # h_fc1 = tf.matmul(h_pool3_flat, W_fc1) + b_fc1  # linear variant
        h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)  # dropout to reduce overfitting

        self.x = tf.nn.relu(tf.layers.dense(h_fc1_drop, 512, name="polfc",
                                            kernel_initializer=U.normc_initializer(1.0)))

    odim = int(self.x.shape[-1])
    adim = pdtype.param_shape()[0]

    with tf.variable_scope('pol'):
        self._policy_nn(odim, adim, train)
    with tf.variable_scope('vf'):
        self._vf_nn(odim, adim, train)
def __init__(self, tf_session, ob_space, ac_space, nbatch,
             reward_redistribution_config, observation_network_config,
             lstm_network_config, training_config, exploration_config,
             nsteps, nlstm=64, reuse=False):
    """LSTM policy network, as described in the RUDDER paper

    Based on baselines.ppo2.policies.py; the LSTM layer sees features from its
    own trainable observation network and the features from the reward
    redistribution observation network.

    Parameters
    ----------
    tf_session : tensorflow session
        tensorflow session to compute the graph in
    ob_space
        Baselines ob_space object (see ppo2_rudder.py); must provide .shape
        attribute for (x, y, c) shapes
    ac_space
        Baselines ac_space object (see ppo2_rudder.py); must provide .n
        attribute for number of possible actions
    nbatch : int
        Batchsize
    nsteps : int
        Fixed number of timesteps to process at once
    reward_redistribution_config : dict
        Dictionary containing config for reward redistribution:
        -----
        lambda_eligibility_trace : float
            Eligibility trace value for redistributed reward
        vf_contrib : float
            Weighting of original value function (vf) vs. redistributed reward
            (rr), s.t. :math:`reward = vf \cdot vf\_contrib + rr \cdot (1 - vf\_contrib)`
        use_reward_redistribution_quality_threshold : float
            Quality of reward redistribution has to exceed
            use_reward_redistribution_quality_threshold to be used;
            range is [0, 1]; the quality measure is the squared prediction
            error, as described in the RUDDER paper
        use_reward_redistribution : bool
            Use reward redistribution?
        rr_junksize : int
            Junksize for reward redistribution; junks overlap by one half each
        cont_pred_w : float
            Weighting of continuous prediction loss vs. prediction loss of the
            final return at the last timestep
        intgrd_steps : int
            Stepsize for integrated gradients
        intgrd_batchsize : int
            Integrated gradients is computed batch-wise if intgrd_batchsize > 1
    observation_network_config : dict
        Dictionary containing config for the observation network that processes
        observations and feeds them to the LSTM network:
        -----
        show_states : bool
            Show frames to network?
        show_statedeltas : bool
            Show frame deltas to network?
        prepoc_states : list of dicts
            Network config to preprocess frames
        prepoc_deltas : list of dicts
            Network config to preprocess frame deltas
        prepoc_observations : list of dicts
            Network config to preprocess features from frame and frame-delta
            preprocessing networks
    lstm_network_config : dict
        Dictionary containing config for the LSTM network:
        -----
        show_actions : bool
            Show taken actions to LSTM?
        reversed : bool
            Process game sequence in reversed order?
        layers : list of dicts
            Network config for the LSTM network and optional additional dense layers
        initializations : dict
            Initialization config for the LSTM network
        timestep_encoding : dict
            Set "max_value" and "triangle_span" for the
            TeLL.utility.misc_tensorflow.TriangularValueEncoding class
    training_config : dict
        Dictionary containing config for training and the update procedure:
        -----
        n_no_rr_updates : int
            Number of updates to perform without training or using the reward
            redistribution network
        n_pretrain_games : int
            Number of games to pretrain the reward redistribution network
            without using it
        downscale_lr_policylag : bool
            Downscale learning rate permanently if policy lag gets too large?
        optimizer : tf.train optimizer
            Optimizer in tf.train, e.g. "AdamOptimizer"
        optimizer_params : dict
            Kwargs for the optimizer
        l1 : float
            Weighting for l1 weight regularization
        l2 : float
            Weighting for l2 weight regularization
        clip_gradients : float
            Threshold for clipping gradients (clipping by norm)
    exploration_config : dict
        Dictionary containing config for exploration:
        -----
        sample_actions_from_softmax : bool
            True: Apply softmax to policy network output and use it as
            probabilities to pick an action;
            False: Use the max. policy network output as action
        temporal_safe_exploration : bool
            Use RUDDER safe exploration
        save_pi_threshold : float
            Threshold value in range [0, 1] for safe actions in RUDDER safe
            exploration
    nlstm : int
        Number of LSTM units (=memory cells)
    reuse : bool
        Reuse tensorflow variables?
    """
    #
    # Shapes
    #
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    seq_ob_shape = (nenv, -1, nh, nw, 1)
    nact = ac_space.n

    #
    # Placeholders for inputs
    #
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    #
    # Prepare input
    #
    single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape),
                            dtype=tf.float32)
    delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape),
                                           dtype=tf.float32)

    #
    # Get observation features from RR model
    #
    rr_model = RewardRedistributionModel(
        reward_redistribution_config=reward_redistribution_config,
        observation_network_config=observation_network_config,
        lstm_network_config=lstm_network_config,
        training_config=training_config,
        scopename="RR")
    self.rr_observation_model = rr_model
    rr_observation_layer = rr_model.get_visual_features(
        single_frame=single_frames, delta_frame=delta_frames, additional_inputs=[])

    #
    # Build policy network
    #
    with tf.variable_scope("model", reuse=reuse):
        temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32),
                                      trainable=False, name='temperature')

        additional_inputs = [StopGradientLayer(rr_observation_layer)]
        observation_layers, observation_features = observation_network(
            single_frame=single_frames, delta_frame=delta_frames,
            additional_inputs=additional_inputs,
            observation_network_config=observation_network_config)

        self.observation_features_shape = observation_features.get_output_shape()

        xs = [tf.squeeze(v, [1]) for v in tf.split(
            axis=1, num_or_size_splits=nsteps,
            value=tf.reshape(observation_layers[-1].get_output(), [nenv, nsteps, -1]))]
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        h6 = h5
        pi = fc(h6, 'pi', nact)
        vf = fc(h6, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    if exploration_config['sample_actions_from_softmax']:
        a0 = self.pd.sample_temp(temperature=temperature)
    else:
        a0 = tf.argmax(pi, axis=-1)

    v0 = vf[:, 0]
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0],
                                          {X: ob, S: state, M: mask})
        return a, v, s, neglogp

    def value(ob, state, mask):
        return tf_session.run(v0, {X: ob, S: state, M: mask})

    def action(ob, state, mask, *_args, **_kwargs):
        a, s, neglogp = tf_session.run([a0, snew, neglogp0],
                                       {X: ob, S: state, M: mask})
        return a, s, neglogp

    #
    # Placeholders for exploration
    #
    n_envs = pi.shape.as_list()[0]
    exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
    prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
    gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
    keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
    prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
    exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))

    #
    # Setting up safe exploration
    #
    explore = tf.logical_and(
        tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                       tf.less_equal(gamelengths_pl,
                                     exploration_timesteps_pl + exploration_durations_pl)),
        tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

    safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
    safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
    save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                               / (n_envs + (n_envs == 1) - 1))
                          * (1 - exploration_config['save_pi_threshold']))
    safe_pi = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
    safe_pi /= tf.reduce_sum(safe_pi)

    rand_safe_a = tf.multinomial(safe_pi, 1)[:, 0]

    safe_pi_flat = tf.reshape(safe_pi, (-1,))
    prev_action_is_safe = tf.gather(
        safe_pi_flat,
        prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
        * safe_pi.shape.as_list()[1])
    prev_action_is_safe = tf.greater(prev_action_is_safe,
                                     tf.constant(0, dtype=tf.float32))

    a_explore = tf.where(
        tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                      tf.not_equal(gamelengths_pl,
                                                   exploration_timesteps_pl)),
                       prev_action_is_safe),
        prev_actions_pl, rand_safe_a)
    a_explore = tf.where(explore, a_explore, a0)

    # Make sure the actor doesn't repeat an action too often
    # (otherwise a screensaver might start)
    rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n,
                               dtype=a0.dtype)
    a_explore = tf.where(tf.greater(prev_action_count_pl,
                                    tf.constant(20, dtype=tf.int64)),
                         rand_a, a_explore)

    if not exploration_config['temporal_safe_exploration']:
        a_explore = a0

    neglogp_explore = self.pd.neglogp(a_explore)

    def action_exploration(ob, state, mask, *_args, exploration_timesteps,
                           prev_actions, gamelengths, keep_prev_action,
                           prev_action_count, exploration_durations, **_kwargs):
        """Get actions with exploration for long-term reward"""
        a, s, neglogp = tf_session.run(
            [a_explore, snew, neglogp_explore],
            {X: ob, S: state, M: mask,
             exploration_timesteps_pl: exploration_timesteps,
             prev_actions_pl: prev_actions,
             gamelengths_pl: gamelengths,
             exploration_durations_pl: exploration_durations,
             keep_prev_action_pl: keep_prev_action,
             prev_action_count_pl: prev_action_count})
        return a, s, neglogp

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
    self.action = action
    self.action_exploration = action_exploration
    self.seq_ob_shape = seq_ob_shape
    self.exploration_config = exploration_config
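# batch_to_seq and seq_to_batch used above are the baselines a2c.utils helpers
# that convert between a flat (nbatch, ...) tensor and a per-timestep list for
# the LSTM; they look roughly like this (consult baselines for the
# authoritative version):
def batch_to_seq(h, nbatch, nsteps, flat=False):
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1])
            for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)]

def seq_to_batch(h, flat=False):
    shape = h[0].get_shape().as_list()
    if not flat:
        assert len(shape) > 1
        nh = h[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    else:
        return tf.reshape(tf.stack(values=h, axis=1), [-1])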
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=False, popart=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope("popart"): self.v_rms = RunningMeanStd(shape=[1]) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) self.norm_vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] if popart: self.vpred = denormalize(self.norm_vpred, self.v_rms) else: self.vpred = self.norm_vpred last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.mean_and_logstd = U.function([ob], [self.pd.mean, self.pd.logstd]) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred]) self.use_popart = popart if popart: self.init_popart() ret = tf.placeholder(tf.float32, [None]) vferr = tf.reduce_mean(tf.square(self.vpred - ret)) self.vlossandgrad = U.function([ob, ret], U.flatgrad(vferr, self.get_vf_variable()))
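# denormalize pairs with the "popart" RunningMeanStd above: the critic predicts
# in normalized space and its output is mapped back to the return scale. A
# minimal sketch of the intended mapping (Pop-Art style; assumed, not the
# repo's definition):
def denormalize(x, rms):
    return x * rms.std + rms.mean

def normalize(x, rms):
    return (x - rms.mean) / rms.std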
def _mlpPolicy(hiddens, ob, ob_space, ac_space, scope, gaussian_fixed_var=True,
               reuse=False):
    assert isinstance(ob_space, gym.spaces.Box)
    with tf.variable_scope(scope, reuse=reuse):
        pdtype = make_pdtype(ac_space)
        sequence_length = None

        with tf.variable_scope("obfilter"):
            ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - ob_rms.mean) / ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(tf.layers.dense(
                    last_out, hidden, name="fc%i" % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
            vpred = tf.layers.dense(last_out, 1, name='final',
                                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i, hidden in enumerate(hiddens):
                last_out = tf.nn.tanh(tf.layers.dense(
                    last_out, hidden, name='fc%i' % (i + 1),
                    kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out, pdtype.param_shape()[0] // 2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        pd = pdtype.pdfromflat(pdparam)

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, pd.sample(), pd.mode())
        _act = U.function([stochastic, ob], [ac, vpred])
    # NB: pd.logits only exists for categorical distributions; for the Gaussian
    # branch above, callers would need pd.flatparam() instead
    return pd.logits, _act
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None,
             sess=None, **tensors):
    """
    Parameters:
    ----------
    env             RL environment
    observations    tensorflow placeholder in which the observations will be fed
    latent          latent state from which policy distribution parameters should be inferred
    vf_latent       latent state from which value function should be inferred (if None, then latent is used)
    sess            tensorflow session to run calculations in (if None, default session is used)
    **tensors       tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent
    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    # Based on the action space, will select what probability distribution type
    self.pdtype = make_pdtype(env.action_space)

    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    # Take an action
    self.action = self.pd.sample()

    # Calculate the neg log of our probability
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess or tf.get_default_session()
    self.vf_latent = vf_latent

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        # value network, built by hand so the head weights can be logged;
        # NB: as written, train_switch always ends up True, so the head is
        # trainable regardless of the batch-size check below
        batch_count = vf_latent.get_shape()[0].value
        train_switch = True
        if 1 != batch_count:
            train_switch = True
        my_initializer = tf.contrib.layers.xavier_initializer()
        nin = vf_latent.get_shape()[1].value
        fc1_W_v = tf.get_variable(shape=[nin, 1], name='value_head_weight',
                                  trainable=train_switch,
                                  initializer=my_initializer)
        fc1_b_v = tf.get_variable(shape=[1], name='value_head_bias',
                                  trainable=train_switch,
                                  initializer=tf.constant_initializer(0))
        tf.summary.histogram("value_head_weight", fc1_W_v)
        tf.summary.histogram("value_head_bias", fc1_b_v)
        self.vf = tf.matmul(vf_latent, fc1_W_v) + fc1_b_v
        # self.vf = fc(vf_latent, 'vf_weights', 1)
        self.vf = self.vf[:, 0]

    self.summary_tensor = None
    self.summary_writer = None
    self.step_id = 0
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, is_discrete=True): #pylint: disable=W0613 if isinstance(ac_space, gym.spaces.Discrete): self.is_discrete = True else: self.is_discrete = False print("nbatch%d" % (nbatch)) nh, nw, nc = ob_space.shape ob_shape = (nbatch, nh, nw, nc) if self.is_discrete: nact = ac_space.n else: nact = ac_space.shape[0] X = tf.placeholder(tf.uint8, ob_shape) #obs with tf.variable_scope("model", reuse=reuse): h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)) h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)) h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)) h3 = conv_to_fc(h3) h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)) pi = fc(h4, 'pi', nact, init_scale=0.01) vf = fc(h4, 'v', 1)[:, 0] if not self.is_discrete: logstd = tf.get_variable(name="logstd", shape=[1, nact], initializer=tf.zeros_initializer()) self.pdtype = make_pdtype(ac_space) if self.is_discrete: self.pd = self.pdtype.pdfromflat(pi) a0 = self.pd.sample() else: pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1) self.pd = self.pdtype.pdfromflat(pdparam) a0 = self.pd.sample() neglogp0 = self.pd.neglogp(a0) self.initial_state = None def step(ob, *_args, **_kwargs): a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob}) assert (a.shape[0] == 1 ) # make sure a = a[0] don't throw away actions a = a[0] return a, v, self.initial_state, neglogp def value(ob, *_args, **_kwargs): return sess.run(vf, {X: ob}) self.X = X self.pi = pi self.vf = vf self.step = step self.value = value
def _init(self, ob_space, sensor_space, ac_space, hid_size, num_hid_layers, kind): assert isinstance(ob_space, gym.spaces.Box) assert isinstance(sensor_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) ob_sensor = U.get_placeholder(name="ob_sensor", dtype=tf.float32, shape=[sequence_length] + list(sensor_space.shape)) ## Obfilter on sensor output with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=sensor_space.shape) obz_sensor = tf.clip_by_value( (ob_sensor - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) #x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) ## Adapted from mlp_policy last_out = obz_sensor for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense(last_out, hid_size, name="vffc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) y = tf.layers.dense(last_out, 64, name="vffinal", kernel_initializer=U.normc_initializer(1.0)) #y = ob_sensor #y = obz_sensor #y = tf.nn.relu(U.dense(y, 64, 'lin_ob', U.normc_initializer(1.0))) x = ob / 255.0 if kind == 'small': # from A3C paper x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) x = U.flattenallbut0(x) x = tf.nn.relu( tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) elif kind == 'large': # Nature DQN x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) x = U.flattenallbut0(x) x = tf.nn.relu( tf.layers.dense(x, 64, name='lin', kernel_initializer=U.normc_initializer(1.0))) else: raise NotImplementedError print(x.shape, y.shape) x = tf.concat([x, y], 1) ## Saver # self.saver = tf.train.Saver() logits = tf.layers.dense(x, pdtype.param_shape()[0], name="logits", kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(logits) self.vpred = tf.layers.dense( x, 1, name="value", kernel_initializer=U.normc_initializer(1.0))[:, 0] self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = self.pd.sample() # XXX self._act = U.function([stochastic, ob, ob_sensor], [ac, self.vpred, logits])
def _init(self, ob_space, ac_space, hid_size, num_hid_layers,
          gaussian_fixed_var=True, bound_by_sigmoid=False, sigmoid_coef=1.,
          activation='tanh', normalize_obs=True, actions='gaussian',
          avg_norm_symmetry=False, symmetric_interpretation=False,
          stdclip=5.0, gaussian_bias=False, gaussian_from_binary=False,
          parallel_value=False, pv_layers=2, pv_hid_size=512, three=False):
    assert isinstance(ob_space, gym.spaces.Box)

    # Select the action-distribution type
    if actions == 'binary':
        self.pdtype = pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32))
    elif actions == 'beta':
        self.pdtype = pdtype = BetaPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32))
    elif actions == 'bernoulli':
        self.pdtype = pdtype = BernoulliPdType(ac_space.low.size)
    elif actions == 'gaussian':
        self.pdtype = pdtype = make_pdtype(ac_space)
    elif actions == 'cat_3':
        self.pdtype = pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32) * 2)
    elif actions == 'cat_5':
        self.pdtype = pdtype = MultiCategoricalPdType(
            low=np.zeros_like(ac_space.low, dtype=np.int32),
            high=np.ones_like(ac_space.high, dtype=np.int32) * 4)
    else:
        assert False, "unknown actions type: %s" % actions

    sequence_length = None
    self.ob = U.get_placeholder(name="ob", dtype=tf.float32,
                                shape=[sequence_length] + list(ob_space.shape))
    self.st = U.get_placeholder(name="st", dtype=tf.int32, shape=[None])

    if normalize_obs:
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        if avg_norm_symmetry:
            # Warning: works only for the plain 41-number observation vector
            ob_mean = (tf.gather(self.ob_rms.mean, ORIG_SYMMETRIC_IDS) + self.ob_rms.mean) / 2
            ob_std = (tf.gather(self.ob_rms.std, ORIG_SYMMETRIC_IDS) + self.ob_rms.std) / 2  # Pretty crude
        else:
            ob_mean = self.ob_rms.mean
            ob_std = self.ob_rms.std
        obz = tf.clip_by_value((self.ob - ob_mean) / ob_std, -stdclip, stdclip)
    else:
        obz = self.ob
        # Constants so that _act below can still return mean/std when
        # observation normalization is disabled (the original referenced
        # undefined names on this path)
        ob_mean = tf.zeros(ob_space.shape)
        ob_std = tf.ones(ob_space.shape)

    vpreds = []
    pparams = []
    # With three=True, build three independent policy/value heads and pick
    # between them per timestep further below
    for part in range(1 if not three else 3):
        part_prefix = "" if part == 0 else "part_" + str(part)

        # Value head
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out, hid_size, part_prefix + "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        vpreds.append(U.dense(last_out, 1, part_prefix + "vffinal",
                              weight_init=U.normc_initializer(1.0)))
        vpreds[-1] = vpreds[-1][:, 0]

        if parallel_value:
            # Additional, wider value network whose output is added to vpred
            last_out_2 = obz
            for i in range(pv_layers):
                last_out_2 = tf.nn.tanh(
                    U.dense(last_out_2, pv_hid_size, part_prefix + "pv_vffc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0)))
            last_out_2 = U.dense(last_out_2, 1, part_prefix + "pv_vffinal",
                                 weight_init=U.normc_initializer(1.0))
            vpreds[-1] += last_out_2[:, 0]

        # Policy head
        last_out = obz
        if activation == 'tanh':
            activation = tf.nn.tanh
        elif activation == 'relu':
            activation = tf.nn.relu
        for i in range(num_hid_layers):
            dense = U.dense(last_out, hid_size, part_prefix + "polfc%i" % (i + 1),
                            weight_init=U.normc_initializer(1.0))
            last_out = activation(dense)

        if actions == 'gaussian':
            if gaussian_fixed_var:
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                               part_prefix + "polfinal", U.normc_initializer(0.01))
                if bound_by_sigmoid:
                    mean = tf.nn.sigmoid(mean * sigmoid_coef)
                logstd = tf.get_variable(name=part_prefix + "logstd",
                                         shape=[1, pdtype.param_shape()[0] // 2],
                                         initializer=tf.zeros_initializer())
                # Broadcast the shared logstd variable to the batch dimension
                logstd = mean * 0.0 + logstd
            else:
                mean = U.dense(last_out, pdtype.param_shape()[0] // 2,
                               part_prefix + "polfinal", U.normc_initializer(0.01))
                logstd = U.dense(last_out, pdtype.param_shape()[0] // 2,
                                 part_prefix + "polfinal_2", U.normc_initializer(0.01))
            if gaussian_bias:
                mean = mean + 0.5
            pdparam = U.concatenate([mean, logstd], axis=1)
        elif actions == 'beta':
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "beta_lastlayer", U.normc_initializer(0.01))
            pdparam = tf.nn.softplus(pdparam)
        elif actions in ['bernoulli', 'binary']:
            if bound_by_sigmoid:
                raise NotImplementedError("bound_by_sigmoid not implemented here")
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "polfinal", U.normc_initializer(0.01))
        elif actions == 'cat_3':
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "cat3_lastlayer", U.normc_initializer(0.01))
        elif actions == 'cat_5':
            pdparam = U.dense(last_out, pdtype.param_shape()[0],
                              part_prefix + "cat5_lastlayer", U.normc_initializer(0.01))
        else:
            assert False

        pparams.append(pdparam)

    pparams = tf.stack(pparams)
    vpreds = tf.stack(vpreds)
    pparams = tf.transpose(pparams, perm=(1, 0, 2))  # [batchsize, networks, params]
    vpreds = tf.transpose(vpreds, perm=(1, 0))       # [batchsize, networks]

    self.stochastic = tf.placeholder(name="stochastic", dtype=tf.bool, shape=())

    if three:
        # Route each sample to one of the three heads based on the episode
        # phase and on whether obstacles are visible ahead
        batchsize = tf.shape(pdparam)[0]
        NO_OBSTACLES_ID = 5
        OBST_DIST = [278, 279, 280, 281, 282, 283, 284, 285]  # TODO: alternative approach
        distances = [self.ob[:, i] for i in OBST_DIST]
        distances = tf.stack(distances, axis=1)
        no_obstacles = tf.cast(tf.equal(self.ob[:, NO_OBSTACLES_ID], 1.0), tf.int32)
        distances = tf.cast(tf.reduce_all(tf.equal(distances, 3), axis=1), tf.int32)
        no_obstacles_ahead = distances * no_obstacles  # 0 if obstacles, 1 if no obstacles
        begin = tf.cast(tf.less(self.st, 75), tf.int32)
        # begin==1 => head 0, begin==0 => head 1 + no_obstacles_ahead
        take_id = (1 - begin) * (1 + no_obstacles_ahead)
        take_id = tf.stack((tf.range(batchsize), take_id), axis=1)
        pdparam = tf.gather_nd(pparams, take_id)
        self.vpred = tf.gather_nd(vpreds, take_id)
    else:
        self.vpred = vpreds[:, 0]
        pdparam = pparams[:, 0]

    self.pd = pdtype.pdfromflat(pdparam)

    # Deterministic action: real_mean if the distribution provides it, else mode
    if hasattr(self.pd, 'real_mean'):
        real_mean = self.pd.real_mean()
        ac = U.switch(self.stochastic, self.pd.sample(), real_mean)
    else:
        ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

    self._act = U.function([self.stochastic, self.ob, self.st],
                           [ac, self.vpred, ob_mean, ob_std])
    if actions == 'binary':
        self._binary_f = U.function([self.stochastic, self.ob, self.st],
                                    [ac, self.pd.flat, self.vpred])
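For reference, a minimal sketch of how this policy might be driven at rollout time. The `env` and `pi` names are illustrative assumptions, not part of the code above; the only firm interface is `_act`, which takes the stochastic flag, a batch of observations, and a batch of step counters (the `st` placeholder feeding the `tf.less(self.st, 75)` routing above):

# Hypothetical rollout loop; assumes `pi` is an instance whose _init has
# already run inside an active TF session, and `env` is a Gym-style env.
stochastic = True
ob = env.reset()
t, done = 0, False
while not done:
    # _act expects batched inputs, hence ob[None] and the length-1 step array
    ac, vpred, ob_mean, ob_std = pi._act(stochastic, ob[None], np.array([t]))
    ob, rew, done, _ = env.step(ac[0])
    t += 1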
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        if Config.USE_COLOR_TRANSFORM:
            out_shape = processed_x.get_shape().as_list()

            # Mask out the velocity box in the top-left corner so the color
            # transform never touches it
            mask_vbox = tf.Variable(tf.zeros_like(processed_x, dtype=bool), trainable=False)
            rh = .2  # hard-coded velocity box size
            mh = int(out_shape[1] * rh)
            mw = mh * 2
            mask_vbox = mask_vbox[:, :mh, :mw].assign(
                tf.ones([out_shape[0], mh, mw, out_shape[-1]], dtype=bool))
            masked = tf.where(mask_vbox, x=tf.zeros_like(processed_x), y=processed_x)

            # tf.image.adjust_brightness vs. PIL.ImageEnhance.Brightness:
            # the tf version is additive while the PIL version is multiplicative
            delta_brightness = tf.get_variable(
                name='randprocess_brightness',
                initializer=tf.random_uniform([], -.5, .5),
                trainable=False)
            # tf.image.adjust_contrast vs. PIL.ImageEnhance.Contrast
            delta_contrast = tf.get_variable(
                name='randprocess_contrast',
                initializer=tf.random_uniform([], .5, 1.5),
                trainable=False)
            # tf.image.adjust_saturation vs. PIL.ImageEnhance.Color
            delta_saturation = tf.get_variable(
                name='randprocess_saturation',
                initializer=tf.random_uniform([], .5, 1.5),
                trainable=False)

            processed_x1 = tf.image.adjust_brightness(masked, delta_brightness)
            processed_x1 = tf.clip_by_value(processed_x1, 0., 255.)
            processed_x1 = tf.where(mask_vbox, x=masked, y=processed_x1)
            processed_x2 = tf.image.adjust_contrast(processed_x1, delta_contrast)
            processed_x2 = tf.clip_by_value(processed_x2, 0., 255.)
            processed_x2 = tf.where(mask_vbox, x=masked, y=processed_x2)
            processed_x3 = tf.image.adjust_saturation(processed_x2, delta_saturation)
            processed_x3 = tf.clip_by_value(processed_x3, 0., 255.)
            # Restore the original pixels inside the velocity box
            processed_x3 = tf.where(mask_vbox, x=processed_x, y=processed_x3)
        else:
            processed_x3 = processed_x

        h, self.dropout_assign_ops = choose_cnn(processed_x3)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
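Because the three `randprocess_*` variables are non-trainable and initialized from `tf.random_uniform`, re-running their initializers draws a fresh color transform. A hedged sketch of how one might re-randomize them between episodes (the `sess` handle and the name-based filter are assumptions about the surrounding training loop):

# Re-randomize the color transform by re-running the initializers of the
# non-trainable randprocess_* variables created above.
rand_vars = [v for v in tf.global_variables() if 'randprocess' in v.name]
resample_op = tf.variables_initializer(rand_vars)
sess.run(resample_op)  # each run draws new brightness/contrast/saturation deltas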
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
    """
    Parameters:
    ----------
    env             RL environment

    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which policy distribution parameters should
                    be inferred (the policy network's hidden layer; this only
                    defines the computation graph)

    vf_latent       latent state from which value function should be inferred
                    (if None, then latent is used; with shared layers the policy
                    and value latents are the same)

    sess            tensorflow session to run calculations in (if None, default
                    session is used)

    **tensors       tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent

    # Flatten everything except the first (batch) dimension
    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    # Based on the action space, select what probability distribution type to use
    self.pdtype = make_pdtype(env.action_space)

    # pdfromlatent attaches a fully connected layer after the latent, sized for
    # the action type; self.pd wraps the resulting action distribution and
    # exposes methods for distribution-related quantities, e.g. the sample and
    # neglogp computed below
    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    # Take an action by sampling from the distribution
    self.action = self.pd.sample()

    # Calculate the negative log-probability of the sampled action
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess or tf.get_default_session()

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        # A single value estimate per state
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]
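A hedged sketch of how this constructor might be wired up with a shared trunk. The class name `PolicyWithValue` and the `network_fn` helper are assumptions about the surrounding code; the firm part is that passing `vf_latent=None` makes the value head reuse the policy latent:

# Hypothetical wiring, assuming the constructor above belongs to a class
# named PolicyWithValue and network_fn maps observations to a latent tensor.
def build_policy(env, network_fn, nbatch):
    ob_space = env.observation_space
    X = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape, name='Ob')
    latent = network_fn(X)  # shared trunk for policy and value
    return PolicyWithValue(env=env,
                           observations=X,
                           latent=latent,  # vf_latent=None -> value head shares it
                           sess=tf.get_default_session())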
def __init__(self, sess, ob_space, action_space, nbatch, nsteps, reuse=False):
    # Used to initialize our kernels
    gain = np.sqrt(2)

    # Based on the action space, select the probability distribution type used
    # to distribute actions in our stochastic policy (in our case
    # DiagGaussianPdType, i.e. a diagonal Gaussian)
    self.pdtype = make_pdtype(action_space)

    height, width, channel = ob_space.shape
    ob_shape = (height, width, channel)

    # Create the input placeholder
    inputs_ = tf.placeholder(tf.float32, [None, *ob_shape], name="input")

    # Normalize the images
    scaled_images = tf.cast(inputs_, tf.float32) / 255.

    """
    Build the model:
        3 CNN layers for spatial dependencies (temporal dependencies are
        handled by stacking frames; amusingly, nobody used an LSTM in the
        OpenAI Retro contest)
        1 shared FC layer
        1 FC head for the policy
        1 FC head for the value
    """
    with tf.variable_scope("model", reuse=reuse):
        conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
        conv2 = conv_layer(conv1, 64, 4, 2, gain)
        conv3 = conv_layer(conv2, 64, 3, 1, gain)
        flatten1 = tf.layers.flatten(conv3)
        fc_common = fc_layer(flatten1, 512, gain=gain)

        # Build a fully connected layer that returns a probability distribution
        # over actions (self.pd) and our pi logits (self.pi)
        self.pd, self.pi = self.pdtype.pdfromlatent(fc_common, init_scale=0.01)

        # Calculate v(s)
        vf = fc_layer(fc_common, 1, activation_fn=None)[:, 0]

    self.initial_state = None

    # Sample an action from the distribution. The policy is stochastic, so we
    # don't always take the highest-probability action: with two actions at
    # probabilities 0.7 and 0.3, the second is taken 30% of the time.
    a0 = self.pd.sample()

    # Take a step: returns the action to take and V(s)
    def step(state_in, *_args, **_kwargs):
        action, value = sess.run([a0, vf], {inputs_: state_in})
        return action, value

    # Calculate only V(s)
    def value(state_in, *_args, **_kwargs):
        return sess.run(vf, {inputs_: state_in})

    # Output only the action to take
    def select_action(state_in, *_args, **_kwargs):
        return sess.run(a0, {inputs_: state_in})

    self.inputs_ = inputs_
    self.vf = vf
    self.step = step
    self.value = value
    self.select_action = select_action
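For completeness, a minimal sketch of driving this policy in a rollout. The `env` and `policy` names are illustrative assumptions; the firm interface is `step`, which takes a batch of observations and returns batched actions and values:

# Illustrative rollout, assuming `policy` is an instance of the class above
# built inside an active tf.Session and `env` is a Gym-style environment.
obs = env.reset()
for _ in range(100):
    # step expects a batch of observations, hence obs[None]
    action, value = policy.step(obs[None])
    obs, reward, done, _ = env.step(action[0])
    if done:
        obs = env.reset()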