def __init__(self, ob_space, ac_space, hidsize,
             ob_mean, ob_std, feat_dim, layernormalize, nl,
             scope="policy", nlstm=256):
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. "
              "It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape, name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        pdparamsize = self.ac_pdtype.param_shape()[0]

        sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(x, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, sh)

        with tf.variable_scope(scope, reuse=False):
            # h, self.dropout_assign_ops = choose_cnn(processed_x)
            # xs = batch_to_seq(h, nenv, nsteps)
            # ms = batch_to_seq(M, nenv, nsteps)
            # h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            # h5 = seq_to_batch(h5)
            # vf = fc(h5, 'v', 1)[:, 0]
            # self.pd, self.pi = self.pdtype.pdfromlatent(h5)
            x = fc(self.flat_features, units=hidsize, activation=activ)
            x = fc(x, units=hidsize, activation=activ)
            pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
            vpred = fc(x, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(pdparam, sh)
        self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
def __init__(self, ob_space, ac_space, hidsize,
             feat_dim, layernormalize, nl, scope="policy"):
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. "
              "It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape, name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        pdparamsize = self.ac_pdtype.param_shape()[0]

        sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(x, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, sh)

        with tf.variable_scope(scope, reuse=False):
            x = fc(self.flat_features, units=hidsize, activation=activ)
            x = fc(x, units=hidsize, activation=activ)
            pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
            vpred = fc(x, name='value_function_output', units=1, activation=None)

        pdparam = unflatten_first_dim(pdparam, sh)
        self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
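# Note (illustrative sketch, not part of the original file): the reshape helpers used
# throughout these policies are assumed to behave roughly as below --
# flatten_two_dims merges the (n_env, n_steps) leading axes so dense layers see a flat
# batch, and unflatten_first_dim restores them from the tf.shape captured before
# flattening.
def flatten_two_dims(x):
    # [A, B, ...rest] -> [A * B, ...rest]
    return tf.reshape(x, [-1] + x.get_shape().as_list()[2:])

def unflatten_first_dim(x, sh):
    # [A * B, ...rest] -> [A, B, ...rest], where sh = tf.shape of the original tensor
    return tf.reshape(x, [sh[0], sh[1]] + x.get_shape().as_list()[1:])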
def get_loss(self):
    nl = tf.nn.leaky_relu
    ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    sh = tf.shape(ac)
    ac = flatten_two_dims(ac)
    ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

    def add_ac(x):
        if x.get_shape().ndims == 2:
            return tf.concat([x, ac], axis=-1)
        elif x.get_shape().ndims == 4:
            sh = tf.shape(x)
            return tf.concat(
                [x,
                 ac_four_dim + tf.zeros(
                     [sh[0], sh[1], sh[2], ac_four_dim.get_shape()[3].value],
                     tf.float32)],
                axis=-1)

    with tf.variable_scope(self.scope):
        x = flatten_two_dims(self.features)
        mu, log_sigma_squared = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
        mu = unflatten_first_dim(mu, sh)
        log_sigma_squared = unflatten_first_dim(log_sigma_squared, sh)
        prediction_pixels = mu * self.ob_std + self.ob_mean
        if self.ama == "true":
            mse = tf.square(mu - 2 * tf.stop_gradient(self.out_features))
            dynamics_reward = tf.reduce_mean(mse - tf.exp(log_sigma_squared),
                                             axis=[2, 3, 4])
            if self.clip_ama == "true":
                dynamics_reward = tf.clip_by_value(dynamics_reward, 0, 1e6)
            loss = tf.reduce_mean(
                tf.exp(-log_sigma_squared) * mse
                + self.uncertainty_penalty * log_sigma_squared,
                axis=[2, 3, 4])
        elif self.ama == "false":
            mse = tf.square(mu - tf.stop_gradient(self.out_features))
            dynamics_reward = tf.reduce_mean(mse, axis=[2, 3, 4])
            loss = dynamics_reward
        else:
            raise ValueError("Please specify whether to use AMA or not")
    return loss, dynamics_reward, prediction_pixels, log_sigma_squared
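# Toy numpy check (illustration only; the names below are not from the file above) of
# the heteroscedastic objective used in the AMA branch: the per-element loss is
# exp(-log_var) * squared_error + penalty * log_var, and the intrinsic reward is
# squared_error - exp(log_var), clipped at zero when clip_ama is enabled.
import numpy as np

mu, target, log_var, penalty = 0.2, 1.0, np.log(0.5), 0.01
sq_err = (mu - target) ** 2
loss = np.exp(-log_var) * sq_err + penalty * log_var
reward = max(sq_err - np.exp(log_var), 0.0)  # analogue of the clip_ama == "true" branch
print("loss=%.4f reward=%.4f" % (loss, reward))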
def __init__(self, ob_space, ac_space, hidsize,
             ob_mean, ob_std, feat_dim, layernormalize, nl, scope='policy'):
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)
        self.placeholder_observation = tf.placeholder(
            dtype=tf.int32, shape=(None, None) + ob_space.shape, name='observation')
        self.placeholder_action = self.ac_pdtype.sample_placeholder(
            [None, None], name='action')
        self.pd = self.vpred = None
        self.scope = scope
        pdparamsize = self.ac_pdtype.param_shape()[0]

        shape = tf.shape(self.placeholder_observation)
        x = flatten_two_dims(self.placeholder_observation)
        self.flat_features = self.get_features(x, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, shape)

        with tf.variable_scope(scope, reuse=False):
            x = fc(self.flat_features, units=hidsize, activation=activ)
            x = fc(x, units=hidsize, activation=activ)
            pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
            value_pred = fc(x, name='value_func_output', units=1, activation=None)

        pdparam = unflatten_first_dim(pdparam, shape)
        self.vpred = unflatten_first_dim(value_pred, shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)
def decoder(self, z):
    # z is the mean of the VAE posterior distribution, shape=(None, None, 512)
    nl = tf.nn.leaky_relu
    z_has_timesteps = (z.get_shape().ndims == 3)
    if z_has_timesteps:
        sh = tf.shape(z)
        z = flatten_two_dims(z)  # (None, 512)
    with tf.variable_scope(self.scope + "decoder"):
        # De-convolutional network; with spherical_obs=True the output has
        # z.shape=(None, 84, 84, 4).
        z = small_deconvnet(z, nl=nl, ch=4 if self.spherical_obs else 8,
                            positional_bias=True)
        if z_has_timesteps:
            z = unflatten_first_dim(z, sh)
        if self.spherical_obs:
            # Spherical loss: scale is a single constant shared across all
            # dimensions, which simplifies the computation.
            scale = tf.get_variable(name="scale", shape=(), dtype=tf.float32,
                                    initializer=tf.ones_initializer())
            scale = tf.maximum(scale, -4.)
            scale = tf.nn.softplus(scale)
            scale = scale * tf.ones_like(z)
        else:
            # Split the output into mu and scale.
            z, scale = tf.split(z, 2, -1)
            scale = tf.nn.softplus(scale)
        # scale = tf.Print(scale, [scale])
        return tf.distributions.Normal(loc=z, scale=scale)
def predict_next(self, reuse):
    if isinstance(self.ac_space, gym.spaces.Discrete):
        ac = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
    else:
        ac = self.ac
    sh = tf.shape(ac)
    ac = flatten_two_dims(ac)

    def add_ac(x):
        return tf.concat([x, ac], axis=-1)

    with tf.variable_scope(self.scope, reuse=reuse):
        x = flatten_two_dims(self.features)
        x = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)

        def residual(x):
            res = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)
            res = tf.layers.dense(add_ac(res), self.hidsize, activation=None)
            return x + res

        for _ in range(4):
            x = residual(x)
        n_out_features = self.out_features.get_shape()[-1].value
        x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
        x = unflatten_first_dim(x, sh)
    return x
def predict_next(self, reuse):
    nl = tf.nn.leaky_relu
    if isinstance(self.ac_space, gym.spaces.Discrete):
        ac = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
    else:
        ac = self.ac
    sh = tf.shape(ac)
    ac = flatten_two_dims(ac)
    ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

    def add_ac(x):
        if x.get_shape().ndims == 2:
            return tf.concat([x, ac], axis=-1)
        elif x.get_shape().ndims == 4:
            sh = tf.shape(x)
            return tf.concat(
                [x,
                 ac_four_dim + tf.zeros(
                     [sh[0], sh[1], sh[2], ac_four_dim.get_shape()[3].value],
                     tf.float32)],
                axis=-1)

    with tf.variable_scope(self.scope, reuse=reuse):
        x = flatten_two_dims(self.features)
        x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
        x = unflatten_first_dim(x, sh)
    self.prediction_pixels = x * self.ob_std + self.ob_mean
    # return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, [2, 3, 4])
    return x
def get_loss(self):
    ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    sh = tf.shape(ac)
    ac = flatten_two_dims(ac)

    def add_ac(x):
        return tf.concat([x, ac], axis=-1)

    with tf.variable_scope(self.scope):
        x = flatten_two_dims(self.features)
        x = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)

        def residual(x):
            res = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)
            res = tf.layers.dense(add_ac(res), self.hidsize, activation=None)
            return x + res

        for _ in range(4):
            x = residual(x)
        n_out_features = self.out_features.get_shape()[-1].value
        x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
        x = unflatten_first_dim(x, sh)
    return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, -1)
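# Standalone sketch (assumption, for illustration) of the action-conditioned residual
# block pattern used by the forward-dynamics losses above: the one-hot action is
# concatenated to the input of every dense layer, and the block's output is added back
# to its input.
import tensorflow as tf

def residual_block(x, ac, hidsize):
    res = tf.layers.dense(tf.concat([x, ac], axis=-1), hidsize,
                          activation=tf.nn.leaky_relu)
    res = tf.layers.dense(tf.concat([res, ac], axis=-1), hidsize, activation=None)
    return x + res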
def get_loss(self):
    nl = tf.nn.leaky_relu
    ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
    sh = tf.shape(ac)
    ac = flatten_two_dims(ac)
    ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

    def add_ac(x):
        if x.get_shape().ndims == 2:
            return tf.concat([x, ac], axis=-1)
        elif x.get_shape().ndims == 4:
            sh = tf.shape(x)
            return tf.concat(
                [x,
                 ac_four_dim + tf.zeros(
                     [sh[0], sh[1], sh[2], ac_four_dim.get_shape()[3].value],
                     tf.float32)],
                axis=-1)

    with tf.variable_scope(self.scope):
        x = flatten_two_dims(self.features)
        x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
        x = unflatten_first_dim(x, sh)
    self.prediction_pixels = x * self.ob_std + self.ob_mean
    return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, [2, 3, 4])
def get_loss(self, ac):
    sh = tf.shape(ac)
    ac = flatten_two_dims(ac)

    def add_ac(x):
        return tf.concat([x, ac], axis=-1)

    with tf.variable_scope(self.scope):
        x = flatten_two_dims(self.features)
        x = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu,
                            reuse=tf.AUTO_REUSE)

        def residual(x):
            res = tf.layers.dense(add_ac(x), self.hidsize,
                                  activation=tf.nn.leaky_relu, reuse=tf.AUTO_REUSE)
            res = tf.layers.dense(add_ac(res), self.hidsize, activation=None,
                                  reuse=tf.AUTO_REUSE)
            return x + res

        for _ in range(4):
            x = residual(x)
        n_out_features = self.out_features.get_shape()[-1].value
        x = tf.layers.dense(add_ac(x), n_out_features, activation=None,
                            reuse=tf.AUTO_REUSE)
        x = unflatten_first_dim(x, sh)
    return x
def decoder(self, z):
    nl = tf.nn.leaky_relu
    z_has_timesteps = (z.get_shape().ndims == 3)
    if z_has_timesteps:
        sh = tf.shape(z)
        z = flatten_two_dims(z)
    with tf.variable_scope(self.scope + "decoder"):
        z = small_deconvnet(z, nl=nl, ch=4 if self.spherical_obs else 8,
                            positional_bias=True)
        if z_has_timesteps:
            z = unflatten_first_dim(z, sh)
        if self.spherical_obs:
            scale = tf.get_variable(name="scale", shape=(), dtype=tf.float32,
                                    initializer=tf.ones_initializer())
            scale = tf.maximum(scale, -4.)
            scale = tf.nn.softplus(scale)
            scale = scale * tf.ones_like(z)
        else:
            z, scale = tf.split(z, 2, -1)
            scale = tf.nn.softplus(scale)
        # scale = tf.Print(scale, [scale])
        return tf.distributions.Normal(loc=z, scale=scale)
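# Hedged usage sketch (not from this file): the Normal distribution returned by the
# decoder is typically scored against the normalized target observation, and the
# negated log-probability averaged over H, W, C gives a per-frame reconstruction loss.
import tensorflow as tf

dist = tf.distributions.Normal(loc=tf.zeros([2, 84, 84, 4]),
                               scale=tf.ones([2, 84, 84, 4]))
target_obs = tf.zeros([2, 84, 84, 4])  # stand-in for normalized observations
recon_nll = -tf.reduce_mean(dist.log_prob(target_obs), axis=[1, 2, 3])  # shape (2,)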
def get_last_features(self, x, reuse):
    x_has_timesteps = (x.get_shape().ndims == 5)
    if x_has_timesteps:
        sh = tf.shape(x)
        x = flatten_two_dims(x)
    with tf.variable_scope(self.scope + "_features", reuse=reuse):
        x = (tf.to_float(x) - self.ob_mean) / self.ob_std
        x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim,
                          last_nl=None, layernormalize=self.layernormalize)
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
        x = tf.reshape(x, [-1, sh[1], self.feat_dim])

    with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
        init_1 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_1, self.last_h_in_1)
        if self.lstm2_size:
            init_2 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_2, self.last_h_in_2)
        if self.aux_input:
            prev_rews = tf.expand_dims(self.ph_last_rew, -1)
            x = tf.concat([x, prev_rews], -1)
        x, c_out_1, h_out_1 = lstm(self.lstm1_size)(x, initial_state=init_1)
        if self.lstm2_size:
            if self.aux_input:
                prev_acs = tf.one_hot(self.ph_last_ac, depth=self.num_actions)
                x = tf.concat([x, tf.cast(prev_acs, tf.float32)], -1)
                x = tf.concat([x, self.ph_last_vel], -1)
            x, c_out_2, h_out_2 = lstm(self.lstm2_size)(x, initial_state=init_2)
    return x
def __init__(self, ob_space, ac_space, hidsize,
             ob_mean, ob_std, feat_dim, layernormalize, nl,
             n_env, n_steps, reuse, n_lstm=256, scope="policy"):
    super(RnnPolicy, self).__init__(ob_space, ac_space, hidsize,
                                    ob_mean, ob_std, feat_dim, layernormalize, nl,
                                    n_env, n_steps, reuse, n_lstm, scope)
    with tf.variable_scope(scope, reuse=self.reuse):
        ## Use features
        x = self.flat_features

        input_sequence = batch_to_seq(x, self.n_env, self.n_steps)
        masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
        rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph,
                                     'lstm1', n_hidden=n_lstm, layer_norm=False)
        rnn_output = seq_to_batch(rnn_output)
        layernorm(rnn_output)

        ## Concat
        q = self.flat_features
        q = tf.concat([q, rnn_output], axis=1)
        q = fc(q, units=hidsize, activation=activ, name="fc1")
        q = fc(q, units=hidsize, activation=activ, name="fc2")

        pdparam, vpred = self.get_pdparam(q)
    self.pdparam = pdparam = unflatten_first_dim(pdparam, self.sh)
    self.vpred = unflatten_first_dim(vpred, self.sh)[:, :, 0]
    self.pd = pd = self.ac_pdtype.proba_distribution_from_flat(pdparam)
    self.a_samp = pd.sample()
    self.entropy = pd.entropy()
    self.nlp_samp = pd.neglogp(self.a_samp)
def set_dynamics(self, dynamics):
    self.dynamics = dynamics
    with tf.variable_scope(self.scope):
        shaped = tf.shape(self.ph_ob)
        flat = flatten_two_dims(self.ph_ob)
        features = self.dynamics.auxiliary_task.get_features(flat, reuse=tf.AUTO_REUSE)
        pdparam = self.get_pdparam(features, False)
        pdparam = unflatten_first_dim(pdparam, shaped)
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
        self.a_samp = pd.sample()
        self.entropy = pd.entropy()
        self.nlp_samp = pd.neglogp(self.a_samp)

        # Alternate ac for forward dynamics
        pdparam_alt = self.get_pdparam(self.extracted_features, True)
        pdparam_alt = unflatten_first_dim(pdparam_alt, shaped)
        self.a_samp_alt = self.ac_pdtype.pdfromflat(pdparam_alt).sample()
def __init__(self, ob_space, ac_space, hidsize,
             ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. "
              "It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std

    # Variables that will be initialized together with the dynamics model.
    self.dynamics = None
    self.a_samp = None
    self.entropy = None
    self.nlp_samp = None
    self.a_samp_alt = None

    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape, name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        self.pdparamsize = self.ac_pdtype.param_shape()[0]

        sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(x, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, sh)
        self.extracted_features = tf.placeholder(dtype=tf.float32,
                                                 shape=self.flat_features.shape)

        with tf.variable_scope(scope, reuse=False):
            x = fc(self.flat_features, units=hidsize, activation=activ)
            x = fc(x, units=hidsize, activation=activ)
            vpred = fc(x, name='value_function_output', units=1, activation=None)
            y = fc(vpred, units=hidsize, activation=activ)
            y = fc(y, units=hidsize, activation=activ)
        self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
def __init__(self, ob_space, ac_space, hidsize,
             ob_mean, ob_std, feat_dim, layernormalize, nl,
             n_env, n_steps, reuse, n_lstm=256, scope="policy"):
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. "
              "It might slow down the training.")
    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.n_env = n_env
    self.n_steps = n_steps
    self.n_batch = n_env * n_steps
    self.n_lstm = n_lstm
    self.reuse = reuse
    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        # self.ac_pdtype = make_pdtype(ac_space)
        self.ac_pdtype = make_proba_dist_type(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(self.n_env, self.n_steps) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder(
            [self.n_env, self.n_steps], name='ac')
        self.masks_ph = tf.placeholder(tf.float32, [self.n_env, self.n_steps],
                                       name="masks_ph")  # mask (done t-1)
        self.flat_masks_ph = tf.reshape(self.masks_ph, [self.n_env * self.n_steps])
        self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2],
                                        name="states_ph")  # states
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        self.pdparamsize = self.ac_pdtype.param_shape()[0]

        self.sh = tf.shape(self.ph_ob)
        x = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(x, reuse=self.reuse)
        self.features = unflatten_first_dim(self.flat_features, self.sh)
def get_loss(self):
    with tf.variable_scope(self.scope):
        x = tf.concat([self.features, self.next_features], 2)
        sh = tf.shape(x)
        x = flatten_two_dims(x)
        x = fc(x, units=self.policy.hidsize, activation=activ)
        x = fc(x, units=self.ac_space.n, activation=None)
        param = unflatten_first_dim(x, sh)
        idfpd = self.policy.ac_pdtype.pdfromflat(param)
        return idfpd.neglogp(self.ac)
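# Sanity-check sketch (illustration only): for a discrete action space, the
# idfpd.neglogp(self.ac) above reduces to a softmax cross-entropy between the
# predicted logits and the action that was actually taken.
import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0, 0.0]])  # hypothetical inverse-dynamics logits
taken_action = tf.constant([0])
neglogp = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=taken_action,
                                                         logits=logits)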
def get_loss(self):
    ac = self.ac
    sh = ac.shape
    ac = flatten_dims(ac, len(self.ac_space.shape))
    # one_hot(self.ac, self.ac_space.n, axis=2)
    ac = torch.zeros(ac.shape + (self.ac_space.n,)).scatter_(
        1, torch.tensor(ac).unsqueeze(1), 1)
    ac = unflatten_first_dim(ac, sh)

    features = self.features
    next_features = self.next_features
    assert features.shape[:-1] == ac.shape[:-1]
    sh = features.shape
    x = flatten_dims(features, 1)
    ac = flatten_dims(ac, 1)
    x = self.loss_net(x, ac)
    x = unflatten_first_dim(x, sh)
    return torch.mean((x - next_features) ** 2, -1)
def forward_predictor(x):
    x = tf.layers.dense(add_ac(x), self.hidsize, activation=tf.nn.leaky_relu)
    for _ in range(4):
        x = residual(x)
    n_out_features = self.out_features.get_shape()[-1].value
    x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
    x = unflatten_first_dim(x, sh)
    return x
def get_features(self, x):
    x_has_timesteps = (x.get_shape().ndims == 5)
    if x_has_timesteps:
        sh = x.shape
        x = flatten_dims(x, self.ob_space.n)
    x = np.transpose(x, [i for i in range(len(x.shape) - 3)] + [-1, -3, -2])
    x = (x - self.ob_mean) / self.ob_std
    x = self.features_model(x)
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
    return x
def get_features(self, x, reuse):
    nl = tf.nn.leaky_relu
    x_has_timesteps = (x.get_shape().ndims == 5)
    if x_has_timesteps:
        sh = tf.shape(x)
        x = flatten_two_dims(x)
    with tf.variable_scope(self.scope + "_features", reuse=reuse):
        x = (tf.to_float(x) - self.ob_mean) / self.ob_std
        x = small_convnet(x, nl=nl, feat_dim=self.feat_dim,
                          last_nl=nl, layernormalize=False)
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
    return x
def get_features(self, x):
    x_has_timesteps = (len(x.shape) == 5)
    if x_has_timesteps:
        sh = x.shape
        x = flatten_dims(x, len(self.ob_space.shape))
    x = (x - self.ob_mean) / self.ob_std
    # transpose channel axis
    x = np.transpose(x, [i for i in range(len(x.shape) - 3)] + [-1, -3, -2])
    x = self.features_model(torch.tensor(x))
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
    return x
def get_loss(self, reuse=False):
    with tf.variable_scope(self.scope, reuse=reuse):
        x = tf.concat([self.features, self.next_features], 2)
        sh = tf.shape(x)
        x = flatten_two_dims(x)
        x = fc(x, units=self.policy.hidsize, activation=activ)
        # x = fc(x, units=self.ac_space.n, activation=None)
        x = fc(x, units=get_action_n(self.ac_space), activation=None)
        param = unflatten_first_dim(x, sh)
        # idfpd = self.policy.ac_pdtype.pdfromflat(param)
        idfpd = self.policy.ac_pdtype.proba_distribution_from_flat(param)
        return idfpd.neglogp(self.ac)
def get_features(self, x, reuse):
    x_has_timesteps = (x.get_shape().ndims == 5)
    if x_has_timesteps:
        sh = tf.shape(x)
        x = flatten_two_dims(x)
    with tf.variable_scope(self.scope + "_features", reuse=reuse):
        x = tf.to_float(x)
        x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim,
                          last_nl=None, layernormalize=self.layernormalize)
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
    return x
def get_features(self, x, reuse):
    # Remember whether the input had a time dimension up front; after flattening and
    # the convnet the rank changes, so re-checking ndims at the end would never fire.
    x_has_timesteps = (x.get_shape().ndims == 5)
    if x_has_timesteps:
        shape = tf.shape(x)
        x = flatten_two_dims(x)
    with tf.variable_scope(self.scope + '_features', reuse=reuse):
        x = (tf.cast(x, tf.float32) - self.ob_mean) / self.ob_std
        x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim,
                          last_nl=None, layernormalize=self.layernormalize)
    if x_has_timesteps:
        x = unflatten_first_dim(x, shape)
    return x
def get_features(self, x):
    x_has_timesteps = (len(x.shape) == 5)
    if x_has_timesteps:
        sh = x.shape  # torch has no torch.shape(); use the tensor's .shape attribute
        x = flatten_two_dims(x)
    x = (x - self.ob_mean) / self.ob_std
    x = np.transpose(x, [i for i in range(len(x.shape) - 3)] + [-1, -3, -2])  # [N, H, W, C] --> [N, C, H, W]
    x = self.features_model(torch.tensor(x))
    if x_has_timesteps:
        x = unflatten_first_dim(x, sh)
    return x
def update_features(self, ob, ac):
    sh = ob.shape  # ob.shape = [nenvs, timestep, H, W, C]. Can timestep > 1?
    # Flatten the first two dims of ob to get a shape of [N, H, W, C].
    x = flatten_dims(ob, len(self.ob_space.shape))
    flat_features = self.get_features(x)  # [N, feat_dim]
    self.flat_features = flat_features
    hidden = self.pd_hidden(flat_features)
    pdparam = self.pd_head(hidden)
    vpred = self.vf_head(hidden)
    self.vpred = unflatten_first_dim(vpred, sh)  # [nenvs, timestep, v]
    self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
    self.ac = ac
    self.ob = ob
def decoder(self, z):
    z_has_timesteps = (len(z.shape) == 3)
    if z_has_timesteps:
        sh = z.shape
        z = flatten_dims(z, 1)
    z = self.decoder_model(z)
    if z_has_timesteps:
        z = unflatten_first_dim(z, sh)
    if self.spherical_obs:
        scale = torch.max(self.scale, torch.tensor(-4.0))
        scale = torch.nn.functional.softplus(scale)
        scale = scale * torch.ones(z.shape)
    else:
        z, scale = torch.split(z, [4, 4], -3)
        scale = torch.nn.functional.softplus(scale)
    return torch.distributions.normal.Normal(z, scale)
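# Small torch sketch (illustration only) of the "spherical" scale trick used above:
# a single learnable scalar is lower-bounded at -4, passed through softplus, and
# broadcast to the shape of the decoder mean before building the Normal.
import torch

z = torch.zeros(2, 4, 84, 84)
scale_param = torch.nn.Parameter(torch.ones(()))
scale = torch.nn.functional.softplus(torch.clamp(scale_param, min=-4.0)) * torch.ones_like(z)
dist = torch.distributions.normal.Normal(z, scale)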
def get_loss(self):
    # Build the inverse dynamics model. Pipeline: take [feature(obs), feature(obs_next)]
    # as input -> output action-distribution parameters -> build a Gaussian or softmax
    # distribution over actions -> use the negative log-prob as the inverse dynamics loss.
    with tf.variable_scope(self.scope):
        # features.shape=(None, None, 512), next_features.shape=(None, None, 512)
        x = tf.concat([self.features, self.next_features], 2)  # x.shape=(None, None, 1024)
        sh = tf.shape(x)
        x = flatten_two_dims(x)  # (None, 1024), fuses feature and next_feature
        x = fc(x, units=self.policy.hidsize, activation=activ)  # (None, 512)
        x = fc(x, units=self.ac_space.n, activation=None)  # (None, 4), action logits
        param = unflatten_first_dim(x, sh)  # (None, None, 4), restore the leading dims
        idfpd = self.policy.ac_pdtype.pdfromflat(param)  # build a distribution from the logits
        # For a continuous action space this is a Gaussian negative log-likelihood;
        # for a discrete action space it is a softmax (cross-entropy) loss.
        return idfpd.neglogp(self.ac)  # shape equals the first two dims, (None, None)
def get_loss(self):
    sh = tf.shape(self.features)
    with tf.variable_scope(self.scope):
        x = flatten_two_dims(self.features)
        x = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
        x = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
        # def residual(x):
        #     res = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
        #     res = tf.layers.dense(x, self.hidsize, activation=None)
        #     return x + res
        # for _ in range(4):
        #     x = residual(x)
        n_out_features = self.out_features.get_shape()[-1].value
        x = tf.layers.dense(x, n_out_features, activation=None)
        x = unflatten_first_dim(x, sh)
    return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, -1)