def make_pdtype(ac_space):
    from cadm import spaces as custom_spaces
    from gym import spaces
    if isinstance(ac_space, custom_spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError
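# Usage sketch (not part of the source): how make_pdtype dispatches on the action
# space. The gym Box of shape (6,) below is an illustrative assumption; for a 1-D
# continuous space the function returns a DiagGaussianPdType whose flat parameter
# vector stacks the means followed by the log-stds.
def _example_make_pdtype():
    import numpy as np
    from gym import spaces

    ac_space = spaces.Box(low=-1.0, high=1.0, shape=(6,), dtype=np.float32)
    pdtype = make_pdtype(ac_space)        # -> DiagGaussianPdType(6)
    assert pdtype.param_shape() == [12]   # 6 means + 6 log-stds
    return pdtype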
def _init(self, ob_space, ac_space, hid_layers=[],
          deterministic=True, diagonal=True, trainable_std=True,
          use_bias=True, use_critic=False, seed=None, verbose=True,
          hidden_W_init=U.normc_initializer(1.0),
          higher_mean_init=None,
          higher_logstd_init=tf.constant_initializer(np.log(0.11)),
          const_std_init=False):
    """Params:
        ob_space: task observation space
        ac_space: task action space
        hid_layers: list with width of each hidden layer
        deterministic: whether the actor is deterministic
        diagonal: whether the higher order policy has a diagonal covariance matrix
        use_bias: whether to include bias in neurons
        use_critic: whether to include a critic network
        seed: optional random seed
    """
    # Check environment's shapes
    assert isinstance(ob_space, gym.spaces.Box)
    assert len(ac_space.shape) == 1

    # Set seed
    if seed is not None:
        set_global_seeds(seed)

    # Set some attributes
    self.diagonal = diagonal
    self.use_bias = use_bias
    batch_length = None  # Accepts a sequence of episodes of arbitrary length
    self.ac_dim = ac_space.shape[0]
    self.ob_dim = ob_space.shape[0]
    self.linear = not hid_layers
    self.verbose = verbose
    self._ob = ob = U.get_placeholder(
        name="ob", dtype=tf.float32,
        shape=[None] + list(ob_space.shape))

    # Actor (N.B.: weight initialization is irrelevant)
    with tf.variable_scope('actor'):
        last_out = ob
        for i, hid_size in enumerate(hid_layers):
            # MLP feature extraction
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out, hid_size,
                                name='fc%i' % (i + 1),
                                kernel_initializer=hidden_W_init,
                                use_bias=use_bias))
        if deterministic and isinstance(ac_space, gym.spaces.Box):
            # Deterministic action selection
            self.actor_mean = actor_mean = \
                tf.layers.dense(last_out, ac_space.shape[0],
                                name='action',
                                kernel_initializer=hidden_W_init,
                                use_bias=use_bias)
        else:
            raise NotImplementedError

    # Get flattened actor weights
    with tf.variable_scope('actor') as scope:
        self.actor_weights = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        # Flatten weights
        self.flat_actor_weights = tf.concat(
            [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)
        self._n_actor_weights = n_actor_weights = \
            self.flat_actor_weights.shape[0]

    # Higher order policy (Gaussian)
    with tf.variable_scope('higher'):
        if higher_mean_init is None:
            # Initial means sampled from a normal distribution (stddev 0.01);
            # bias init always zero
            higher_mean_init = tf.where(
                tf.not_equal(self.flat_actor_weights,
                             tf.constant(0, dtype=tf.float32)),
                tf.random_normal(shape=[n_actor_weights.value], stddev=0.01),
                tf.zeros(shape=[n_actor_weights]))
        self.higher_mean = tf.get_variable(
            name='higher_mean',
            initializer=higher_mean_init,
            shape=self.flat_actor_weights.get_shape())
        # Keep the weights' domain compact
        # self.higher_mean = higher_mean = tf.clip_by_value(
        #     self.higher_mean, -1, 1, 'higher_mean_clipped')
        higher_mean = self.higher_mean
        if diagonal:
            if const_std_init:
                self.higher_logstd = higher_logstd = \
                    tf.get_variable(
                        name='higher_logstd',
                        initializer=higher_logstd_init,
                        trainable=trainable_std)
            else:
                self.higher_logstd = higher_logstd = \
                    tf.get_variable(
                        name='higher_logstd',
                        shape=[n_actor_weights],
                        initializer=higher_logstd_init,
                        trainable=trainable_std)
            pdparam = tf.concat(
                [higher_mean, higher_mean * 0. + higher_logstd], axis=0)
            self.pdtype = pdtype = \
                DiagGaussianPdType(n_actor_weights.value)
        else:
            # Cholesky covariance matrix
            self.higher_logstd = higher_logstd = tf.get_variable(
                name='higher_logstd',
                shape=[n_actor_weights * (n_actor_weights + 1) // 2],
                initializer=tf.initializers.constant(0.))
            pdparam = tf.concat([higher_mean, higher_logstd], axis=0)
            self.pdtype = pdtype = CholeskyGaussianPdType(
                n_actor_weights.value)

    # Sample actor weights
    self.pd = pdtype.pdfromflat(pdparam)
    sampled_actor_params = self.pd.sample()
    symm_sampled_actor_params = self.pd.sample_symmetric()
    self._sample_actor_params = U.function([], [sampled_actor_params])
    self._sample_symm_actor_params = U.function(
        [], list(symm_sampled_actor_params))

    # Assign actor weights
    with tf.variable_scope('actor') as scope:
        actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope=scope.name)
        self._use_sampled_actor_params = \
            U.assignFromFlat(actor_params, sampled_actor_params)
        self._get_actor_params = U.GetFlat(actor_params)
        self._set_actor_params = U.SetFromFlat(actor_params)

    # Act
    self._action = action = actor_mean
    self._act = U.function([ob], [action])

    # Manage higher policy weights
    with tf.variable_scope('higher') as scope:
        self._higher_params = higher_params = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        self.flat_higher_params = tf.concat(
            [tf.reshape(w, [-1]) for w in self._higher_params], axis=0)
        self._n_higher_params = self.flat_higher_params.shape[0]
        self._get_flat_higher_params = U.GetFlat(higher_params)
        self._set_higher_params = U.SetFromFlat(self._higher_params)

    # Evaluating
    self._actor_params_in = actor_params_in = \
        U.get_placeholder(name='actor_params_in',
                          dtype=tf.float32,
                          shape=[batch_length] + [n_actor_weights])
    self._rets_in = rets_in = \
        U.get_placeholder(name='returns_in',
                          dtype=tf.float32,
                          shape=[batch_length])
    # N.B. tf.nn.moments returns (mean, variance)
    ret_mean, ret_std = tf.nn.moments(rets_in, axes=[0])
    self._get_ret_mean = U.function([self._rets_in], [ret_mean])
    self._get_ret_std = U.function([self._rets_in], [ret_std])
    self._logprobs = logprobs = self.pd.logp(actor_params_in)
    pgpe_times_n = U.flatgrad(logprobs * rets_in, higher_params)
    self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                        [pgpe_times_n])
    self._get_actor_mean = U.function([ob], [self.actor_mean])
    self._get_higher_mean = U.function([ob], [self.higher_mean])
    self._get_higher_std = U.function([], tf.exp([self.higher_logstd]))

    # Batch off-policy PGPE
    self._probs = tf.exp(logprobs)
    self._behavioral = None
    self._renyi_other = None

    # Renyi computation
    self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

    # Fisher computation (diagonal case)
    mean_fisher_diag = tf.exp(-2 * self.higher_logstd)
    if trainable_std:
        cov_fisher_diag = mean_fisher_diag * 0 + 2
        self._fisher_diag = tf.concat(
            [mean_fisher_diag, cov_fisher_diag], axis=0)
    else:
        self._fisher_diag = mean_fisher_diag
    self._get_fisher_diag = U.function([], [self._fisher_diag])
def _init(self, ob_space, ac_space, hid_layers=[],
          deterministic=True, diagonal=True, use_bias=False,
          use_critic=False, seed=None, verbose=True, zero_init=False):
    """Params:
        ob_space: task observation space
        ac_space: task action space
        hid_layers: list with width of each hidden layer
        deterministic: whether the actor is deterministic
        diagonal: whether the higher order policy has a diagonal covariance matrix
        use_bias: whether to include bias in neurons
        use_critic: whether to include a critic network
        seed: optional random seed
    """
    assert isinstance(ob_space, gym.spaces.Box)
    assert len(ac_space.shape) == 1
    self.diagonal = diagonal
    self.use_bias = use_bias
    batch_length = None  # Accepts a sequence of episodes of arbitrary length
    self.observation_space = ob_space
    self.action_space = ac_space
    self.ac_dim = ac_space.shape[0]
    self.ob_dim = ob_space.shape[0]
    self.hid_layers = hid_layers
    self.deterministic = deterministic
    self.use_critic = use_critic
    self.linear = not hid_layers
    self.verbose = verbose
    if seed is not None:
        set_global_seeds(seed)

    self._ob = ob = U.get_placeholder(
        name="ob", dtype=tf.float32,
        shape=[None] + list(ob_space.shape))

    # Critic (normally not used)
    if use_critic:
        with tf.variable_scope('critic'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out, hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # Actor (N.B.: weight initialization is irrelevant)
    with tf.variable_scope('actor'):
        last_out = ob
        for i, hid_size in enumerate(hid_layers):
            # MLP feature extraction
            last_out = tf.nn.tanh(
                tf.layers.dense(
                    last_out, hid_size,
                    name='fc%i' % (i + 1),
                    kernel_initializer=tf.initializers.constant(0.),
                    use_bias=use_bias))
        if deterministic and isinstance(ac_space, gym.spaces.Box):
            # Deterministic action selection
            self.actor_mean = actor_mean = tf.layers.dense(
                last_out, ac_space.shape[0],
                name='action',
                kernel_initializer=tf.initializers.constant(0.),
                use_bias=use_bias)
        else:
            # Currently supports only deterministic action policies
            raise NotImplementedError

    # Higher order policy (Gaussian)
    with tf.variable_scope('actor') as scope:
        self.actor_weights = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        self.flat_actor_weights = tf.concat(
            [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)  # flatten
        self._n_actor_weights = n_actor_weights = \
            self.flat_actor_weights.shape[0]

    with tf.variable_scope('higher'):
        if zero_init:
            higher_mean_init = tf.where(
                tf.not_equal(self.flat_actor_weights,
                             tf.constant(0, dtype=tf.float32)),
                tf.zeros(shape=[n_actor_weights.value]),
                tf.zeros(shape=[n_actor_weights]))
        else:
            # Initial means sampled from a normal distribution (stddev 0.01)
            higher_mean_init = tf.where(
                tf.not_equal(self.flat_actor_weights,
                             tf.constant(0, dtype=tf.float32)),
                tf.random_normal(shape=[n_actor_weights.value], stddev=0.01),
                tf.zeros(shape=[n_actor_weights]))
        self.higher_mean = higher_mean = tf.get_variable(
            name='higher_mean', initializer=higher_mean_init)
        if diagonal:
            # Diagonal covariance matrix; all stds initialized to 0
            self.higher_logstd = higher_logstd = tf.get_variable(
                name='higher_logstd',
                shape=[n_actor_weights],
                initializer=tf.initializers.constant(0.))
            pdparam = tf.concat(
                [higher_mean, higher_mean * 0. + higher_logstd], axis=0)
            self.pdtype = pdtype = DiagGaussianPdType(
                n_actor_weights.value)
        else:
            # Cholesky covariance matrix
            self.higher_logstd = higher_logstd = tf.get_variable(
                name='higher_logstd',
                shape=[n_actor_weights * (n_actor_weights + 1) // 2],
                initializer=tf.initializers.constant(0.))
            pdparam = tf.concat([higher_mean, higher_logstd], axis=0)
            self.pdtype = pdtype = CholeskyGaussianPdType(
                n_actor_weights.value)

    # Sample actor weights
    self.pd = pdtype.pdfromflat(pdparam)
    sampled_actor_params = self.pd.sample()
    symm_sampled_actor_params = self.pd.sample_symmetric()
    self._sample_symm_actor_params = U.function(
        [], list(symm_sampled_actor_params))
    self._sample_actor_params = U.function([], [sampled_actor_params])

    # Assign actor weights
    with tf.variable_scope('actor') as scope:
        actor_params = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        self._use_sampled_actor_params = U.assignFromFlat(
            actor_params, sampled_actor_params)
        self._set_actor_params = U.SetFromFlat(actor_params)
        self._get_actor_params = U.GetFlat(actor_params)

    # Act
    self._action = action = actor_mean
    self._act = U.function([ob], [action])

    # Higher policy weights
    with tf.variable_scope('higher') as scope:
        self._higher_params = higher_params = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        self.flat_higher_params = tf.concat(
            [tf.reshape(w, [-1]) for w in self._higher_params], axis=0)  # flatten
        self._n_higher_params = self.flat_higher_params.shape[0]
        self._get_flat_higher_params = U.GetFlat(higher_params)
        self._set_higher_params = U.SetFromFlat(self._higher_params)

    # Batch PGPE
    self._actor_params_in = actor_params_in = \
        U.get_placeholder(name='actor_params_in',
                          dtype=tf.float32,
                          shape=[batch_length] + [n_actor_weights])
    self._rets_in = rets_in = U.get_placeholder(name='returns_in',
                                                dtype=tf.float32,
                                                shape=[batch_length])
    # N.B. tf.nn.moments returns (mean, variance)
    ret_mean, ret_std = tf.nn.moments(rets_in, axes=[0])
    self._get_ret_mean = U.function([self._rets_in], [ret_mean])
    self._get_ret_std = U.function([self._rets_in], [ret_std])
    self._logprobs = logprobs = self.pd.logp(actor_params_in)
    pgpe_times_n = U.flatgrad(logprobs * rets_in, higher_params)
    self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                        [pgpe_times_n])

    # One-episode PGPE
    # Used N times to compute the baseline -> can we do better?
    self._one_actor_param_in = one_actor_param_in = U.get_placeholder(
        name='one_actor_param_in',
        dtype=tf.float32,
        shape=[n_actor_weights])
    one_logprob = self.pd.logp(one_actor_param_in)
    score = U.flatgrad(one_logprob, higher_params)
    score_norm = tf.norm(score)
    self._get_score = U.function([one_actor_param_in], [score])
    self._get_score_norm = U.function([one_actor_param_in], [score_norm])

    # Batch off-policy PGPE
    self._probs = tf.exp(logprobs)
    self._behavioral = None
    self._renyi_other = None

    # One-episode off-policy PGPE
    self._one_prob = tf.exp(one_logprob)

    # Renyi computation
    self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

    # Fisher computation (diagonal case)
    mean_fisher_diag = tf.exp(-2 * self.higher_logstd)
    cov_fisher_diag = mean_fisher_diag * 0 + 2
    self._fisher_diag = tf.concat([mean_fisher_diag, cov_fisher_diag], axis=0)
    self._get_fisher_diag = U.function([], [self._fisher_diag])

    # Multiple importance sampling
    self._memory = None
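# Usage sketch (not part of the source): a minimal PGPE-style update built only from
# the functions created in _init above (_sample_actor_params, _set_actor_params, _act,
# _get_pgpe_times_n, _get_flat_higher_params, _set_higher_params). `pol` is assumed to
# be an instance of this higher-order policy, `env` an old-API gym.Env, and
# `learning_rate` a hypothetical step size; a default TF session with initialized
# variables is also assumed.
def pgpe_update_sketch(pol, env, n_episodes=10, horizon=200, learning_rate=1e-2):
    import numpy as np
    thetas, rets = [], []
    for _ in range(n_episodes):
        theta = pol._sample_actor_params()[0]   # draw actor weights from the higher-order Gaussian
        pol._set_actor_params(theta)            # load them into the actor network
        ob, ret, done, t = env.reset(), 0.0, False, 0
        while not done and t < horizon:
            ac = pol._act(ob[None])[0][0]       # deterministic actor action
            ob, rew, done, _ = env.step(ac)
            ret += rew
            t += 1
        thetas.append(theta)
        rets.append(ret)
    # REINFORCE-style gradient w.r.t. the higher-order parameters (the flatgrad in
    # _init computes it "times N", hence the division by n_episodes)
    grad = pol._get_pgpe_times_n(np.array(thetas), np.array(rets))[0] / n_episodes
    rho = pol._get_flat_higher_params()
    pol._set_higher_params(rho + learning_rate * grad)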
class SysIDPolicy(object):

    recurrent = False

    def __init__(self, name, *args, **kwargs):
        with tf.variable_scope(name):
            self._init(*args, **kwargs)
            self.scope = tf.get_variable_scope().name

    # set up the network
    # NOTE: due to normalization of SysID values and KL-regularization of the
    # embedding space, alpha_sysid shouldn't need to vary between environments
    # - but we'll see...
    def _init(self, np_random, flavor, dim, hid_size=32, n_hid=2,
              alpha_sysid=0.1, test=False):

        print("obs dim:", dim.ob)

        # inputs & hyperparameters
        self.flavor = flavor
        self.dim = dim
        self.alpha_sysid = alpha_sysid
        self.ob = U.get_placeholder(name="ob", dtype=tf.float32,
                                    shape=(None, dim.ob_concat))
        self.ob_traj = U.get_placeholder(name="ob_traj", dtype=tf.float32,
                                         shape=[None, dim.window, dim.ob])
        self.ac_traj = U.get_placeholder(name="ac_traj", dtype=tf.float32,
                                         shape=[None, dim.window, dim.ac])

        # regular inputs whitening
        ob, sysid = tf.split(self.ob, [dim.ob, dim.sysid], axis=1)
        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(shape=(dim.ob_concat,))
            obz_all = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std,
                -5.0, 5.0, name="ob_normalizer")
        obz, sysidz = tf.split(obz_all, [dim.ob, dim.sysid], axis=1)
        print("obz dim:", obz.shape, "sysidz dim:", sysidz.shape)
        with tf.variable_scope("ob_white"):
            obz = tf.identity(obz)
        with tf.variable_scope("sysid_white"):
            self.sysidz = tf.identity(sysidz)

        # trajectory inputs for SysID
        # NOTE: the environment should be defined such that
        # actions are relatively close to Normal(0,1)
        ob_trajz = tf.clip_by_value(
            (self.ob_traj - self.ob_rms.mean[:dim.ob]) / self.ob_rms.std[:dim.ob],
            -5.0, 5.0, name="ob_traj_white")
        trajs = tf.concat([ob_trajz, self.ac_traj], axis=2)

        # these rewards will be optimized via direct gradient-based optimization
        # (not RL reward), in the same place as e.g. the entropy regularization
        self.extra_rewards = []
        self.extra_reward_names = []

        with tf.variable_scope("sysid"):
            if flavor == PLAIN:
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EXTRA:
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EMBED:
                self.traj2embed = sysid_convnet(np_random, trajs, dim.embed)

        EMBED_N_HID = 2
        EMBED_HID_SZ = 2 * dim.sysid

        # policy
        with tf.variable_scope("pol"):
            if flavor == BLIND:
                policy_input = obz
                self.sysid_err_supervised = tf.constant(0.0)
            elif flavor == PLAIN:
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
                policy_input = tf.concat(
                    [obz, self.traj2sysid], axis=1) if test else obz_all
            elif flavor == EXTRA:
                sysid_processor_input = self.traj2sysid if test else sysidz
                sysid_processor = MLPModule(np_random, sysid_processor_input,
                                            EMBED_N_HID, EMBED_HID_SZ, 1.0,
                                            dim.embed, "sysid_processor")
                policy_input = tf.concat([obz, sysid_processor],
                                         axis=1, name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
            elif flavor == EMBED:
                self.embed = MLPModule(np_random, sysidz, EMBED_N_HID,
                                       EMBED_HID_SZ, 1.0, dim.embed, "embed")
                embed_input = self.traj2embed if test else self.embed
                policy_input = tf.concat([obz, embed_input],
                                         axis=1, name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(self.embed), self.traj2embed)
                mean, var = tf.nn.moments(self.embed, 0)
                dist = tf.distributions.Normal(loc=mean, scale=tf.sqrt(var))
                std_dist = tf.distributions.Normal(loc=0.0, scale=1.0)
                embed_KL = tf.reduce_mean(
                    tf.distributions.kl_divergence(dist, std_dist))
                self.extra_rewards.append(-0.1 * embed_KL)
                self.extra_reward_names.append("neg_embed_KL")
            elif flavor == TRAJ:
                self.traj_conv = sysid_convnet(np_random, trajs, dim.embed)
                policy_input = tf.concat([obz, self.traj_conv],
                                         axis=1, name="input_concat")
                self.sysid_err_supervised = tf.constant(0.0)
            else:
                raise ValueError("flavor '{}' does not exist".format(flavor))

        # main policy MLP: outputs mean and logstd of the stochastic Gaussian policy
        with tf.variable_scope("policy"):
            print("policy input dimensionality:",
                  policy_input.get_shape().as_list())
            mean = MLPModule(np_random, policy_input, n_hid, hid_size,
                             0.01, dim.ac, "pol")
            logstd = tf.maximum(
                tf.get_variable(name="logstd", shape=[1, dim.ac],
                                initializer=tf.constant_initializer(-0.3)),
                -1.0)

        with tf.variable_scope("policy_to_gaussian"):
            pdparam = tf.concat([mean, mean * 0.0 + logstd], 1)
            self.pdtype = DiagGaussianPdType(dim.ac)
            self.pd = self.pdtype.pdfromflat(pdparam)

        # value function
        with tf.variable_scope("vf"):
            self.vpred = MLPModule(np_random, tf.stop_gradient(policy_input),
                                   n_hid, hid_size, 0.1, 1, "vf")[:, 0]

        # switch between stochastic and deterministic policy
        with tf.variable_scope("stochastic_switch"):
            self.stochastic = tf.placeholder(dtype=tf.bool, shape=(),
                                             name="stochastic")
            self.ac = U.switch(self.stochastic, self.pd.sample(), self.pd.mode())

        # function we'll call when interacting with environment
        self._act = U.function([self.stochastic, self.ob],
                               [self.ac, self.vpred])
        # for test time, the trajectory is fed in
        self._act_traj = U.function(
            [self.stochastic, self.ob, self.ob_traj, self.ac_traj],
            [self.ac, self.vpred])

    # given the actual dynamics parameters, compute the embedding
    def sysid_to_embedded(self, sysid_vals):
        if self.flavor in [BLIND, TRAJ]:
            # could also just return sysid_vals, but this draws attention
            # to lack of sysid
            return 0 * sysid_vals

        # pass val[None,:] if needing to evaluate for just one sysid val
        assert len(sysid_vals.shape) == 2
        k = sysid_vals.shape[0]
        sysid_vals = np.concatenate(
            [np.zeros((k, self.dim.ob)), sysid_vals], axis=1)
        sess = tf.get_default_session()

        if self.flavor == EMBED:
            embed = sess.run(self.embed, feed_dict={self.ob: sysid_vals})
            return embed
        else:
            sysidz = sess.run(self.sysidz, feed_dict={self.ob: sysid_vals})
            return sysidz

    # given the ob/ac trajectories, estimate the embedding.
    # it's also part of the main policy, but needed on its own for TRPO.
    def estimate_sysid(self, ob_trajs, ac_trajs):
        sess = tf.get_default_session()
        N = ob_trajs.shape[0]
        k = N // 2048 + 1

        if self.flavor in [BLIND, TRAJ]:
            return np.zeros((N, self.dim.sysid))

        # TODO use tf.data or something to do this automatically!
        def gen(ob_splits, ac_splits):
            for o, a in zip(ob_splits, ac_splits):
                feed = {
                    self.ob_traj: o,
                    self.ac_traj: a,
                }
                if self.flavor == EMBED:
                    yield sess.run(self.traj2embed, feed_dict=feed)
                else:
                    yield sess.run(self.traj2sysid, feed_dict=feed)

        est = np.vstack(
            gen(np.array_split(ob_trajs, k), np.array_split(ac_trajs, k)))
        return est

    # act - ob is concat(ob, sysid)
    def act(self, stochastic, ob):
        ac1, vpred1 = self._act(stochastic, ob)
        return ac1, vpred1

    def act_traj(self, stochastic, ob, ob_traj, ac_traj):
        return self._act_traj(stochastic, ob, ob_traj, ac_traj)

    # for OpenAI Baselines compatibility
    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_initial_state(self):
        return []
class MlpPolicy(object):
    """A multilayer perceptron to map state observations into actions.

    Note:
        The last layer of this network parameterises a diagonal Gaussian
        distribution, so the output can be stochastic by sampling from the
        distribution, or deterministic by taking its mean.

    Args:
        name: Name of the scope under which to declare all the network's tf variables
        observation_shape: Shape of the observation space
        action_shape: Shape of the action space
        hid_size: Number of neurons per hidden layer
        num_hid_layers: Number of hidden layers
        stochastic: Whether to sample the output distribution or take its mean
            when generating actions
    """

    def __init__(self, name, observation_shape, action_shape, hid_size,
                 num_hid_layers, stochastic=True):
        with tf.variable_scope(name):
            self.stochastic = stochastic
            self.hid_size, self.num_hid_layers = hid_size, num_hid_layers
            self.action_shape, self.observation_shape = action_shape, observation_shape
            self.scope = tf.get_variable_scope().name
            self.pdtype = DiagGaussianPdType(action_shape[0])

            observations_ph = U.get_placeholder(
                name='ob', dtype=tf.float32,
                shape=[None] + list(observation_shape))
            stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('pol'):
                last_out = tf.clip_by_value(
                    (observations_ph - self.ob_rms.mean) / self.ob_rms.std,
                    -5.0, 5.0)
                for i in range(num_hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(
                            last_out, hid_size, name='fc%i' % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                mean = tf.layers.dense(
                    last_out, self.pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name='logstd',
                    shape=[1, self.pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())

            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)

            action_op = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
            self._act = U.function([stochastic_ph, observations_ph], action_op)

    def act(self, observation):
        """Convenience function for generating a single action given an observation.

        Args:
            observation: A state observation
        """
        return self._act(self.stochastic, np.array(observation)[None])[0]

    def get_variables(self):
        """Gets all the tf variables associated with this network."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        """Gets all the trainable tf variables associated with this network."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def make_target_network(self, name):
        """Creates a network which periodically updates its weights by copying
        them from this network.

        Args:
            name: Name of the scope under which to declare all the target
                network's tf variables
        """
        return TargetMlpPolicy(name, self)
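# Usage sketch (not part of the source): constructing an MlpPolicy for a small
# continuous-control task and querying one action. The (3,) observation and (1,)
# action shapes are illustrative assumptions; a default TF session with initialized
# variables is required because U.function runs in tf.get_default_session().
def _example_mlp_policy():
    import numpy as np
    import tensorflow as tf

    with tf.Session() as sess:
        pi = MlpPolicy('pi', observation_shape=(3,), action_shape=(1,),
                       hid_size=64, num_hid_layers=2, stochastic=True)
        sess.run(tf.global_variables_initializer())
        action = pi.act(np.zeros(3))   # samples from the diagonal Gaussian (stochastic=True)
        return action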