class CnnPolicy(StochasticPolicy):
    """Feed-forward CNN policy with intrinsic and extrinsic value heads.

    The constructor builds the policy graph twice with shared variables (one
    copy for optimization, one for rollouts) and then an auxiliary
    target/predictor pair whose squared error is exposed as ``self.int_rew``
    and whose masked mean is exposed as ``self.aux_loss``.
    """

    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size="normal",
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.0,
        dynamics_bonus=False,
        meta_rl=False,
    ):
        """Build placeholders, both policy graphs and the bonus network.

        Args:
            scope: Variable scope name handed to ``apply_policy``.
            ob_space: Observation space; ``ob_space.shape[:2]`` sizes the
                normalization placeholders and running statistics.
            ac_space: Action space, forwarded to ``StochasticPolicy``.
            policy_size: One of ``"small"``, ``"normal"``, ``"large"``;
                selects a width multiplier of 1, 2 or 4.
            extrahid: Whether to add the residual hidden layers before the
                policy and value heads.
            hidsize: Base width of the first dense layer (scaled by the
                multiplier).
            memsize: Base size of the state placeholder (scaled by the
                multiplier).
            rec_gate_init: Accepted but not read anywhere in this class.
            update_ob_stats_independently_per_gpu: When true, the running
                observation statistics are created with ``use_mpi=False``.
            proportion_of_exp_used_for_predictor_update: Threshold compared
                against uniform noise to mask samples out of the aux loss.
            dynamics_bonus: Selects the dynamics-prediction bonus instead of
                the self-prediction (RND) bonus.
            meta_rl: Forwarded to ``StochasticPolicy.__init__``.
        """
        StochasticPolicy.__init__(self, scope, ob_space, ac_space, meta_rl=meta_rl)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu,
        )
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name="state")
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # Inputs to policy and value function will have different shapes
        # depending on whether it is rollout or optimization time, so we
        # treat them separately. The optimization graph drops the last time
        # step of the observation placeholder.
        (
            self.pdparam_opt,
            self.vpred_int_opt,
            self.vpred_ext_opt,
            self.snext_opt,
        ) = self.apply_policy(
            self.ph_ob['obs'][:, :-1],
            reuse=False,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps - 1,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        (
            self.pdparam_rollout,
            self.vpred_int_rollout,
            self.vpred_ext_rollout,
            self.snext_rollout,
        ) = self.apply_policy(
            self.ph_ob['obs'],
            reuse=True,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(
        ph_ob,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
        additional_inputs=None,
    ):
        """Build the conv + dense policy/value network on ``ph_ob``.

        Args:
            ph_ob: Rank-5 image tensor (asserted); the last three axes are
                used as the per-frame image shape.
            reuse: Passed to ``tf.variable_scope`` to share variables.
            scope: Variable scope name.
            hidsize: Width of the ``fc1`` layer.
            memsize: Second dimension of the returned zero state.
            extrahid: Whether to add the ``fc2val`` / ``fc2act`` residual
                layers.
            sy_nenvs: First dimension used to reshape the outputs.
            sy_nsteps: Second dimension used to reshape the outputs.
            pdparamsize: Width of the policy-parameter head.
            additional_inputs: Optional mapping. When it contains both
                ``'prev_acs'`` and ``'prev_rew'`` those tensors are cast to
                float, flattened over their first two axes and concatenated
                to the conv features. ``None`` is treated as an empty
                mapping.

        Returns:
            ``(pdparam, vpred_int, vpred_ext, snext)`` where ``pdparam`` is
            reshaped to ``(sy_nenvs, sy_nsteps, pdparamsize)``, both value
            predictions to ``(sy_nenvs, sy_nsteps)`` and ``snext`` is a zero
            tensor of shape ``(sy_nenvs, memsize)``.
        """
        # The default used to be dereferenced directly, so calling without
        # additional_inputs raised TypeError on the membership test below.
        if additional_inputs is None:
            additional_inputs = {}
        data_format = "NHWC"
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info(
            f"CnnPolicy: using '{ph.name}' shape {ph.shape} as image input")
        X = tf.cast(ph, tf.float32) / 255.0
        # Merge the two leading axes so the convs see a flat batch of frames.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(scope, reuse=reuse), \
                tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
            X = activ(
                conv(X, "c1", nf=32, rf=8, stride=4,
                     init_scale=np.sqrt(2), data_format=data_format))
            X = activ(
                conv(X, "c2", nf=64, rf=4, stride=2,
                     init_scale=np.sqrt(2), data_format=data_format))
            X = activ(
                conv(X, "c3", nf=64, rf=4, stride=1,
                     init_scale=np.sqrt(2), data_format=data_format))
            X = to2d(X)
            mix_other_observations = [X]

            if ('prev_acs' in additional_inputs) and ('prev_rew' in additional_inputs):
                # Cast the extra inputs to float tensors.
                prev_acs = tf.cast(additional_inputs['prev_acs'], tf.float32)
                prev_rew = tf.cast(additional_inputs['prev_rew'], tf.float32)
                # Flatten out the time dimension.
                prev_acs = tf.reshape(prev_acs,
                                      (-1, *prev_acs.shape.as_list()[2:]))
                prev_rew = tf.reshape(prev_rew,
                                      (-1, *prev_rew.shape.as_list()[2:]))
                # Add to the 2D features going to the FC layers.
                mix_other_observations.extend([prev_acs, prev_rew])

            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(
                fc(X, "fc_additional", nh=additional_size,
                   init_scale=np.sqrt(2)))
            # No recurrent state in this policy: the next state is all zeros.
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, "fc2val", nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, "fc2act", nh=additional_size, init_scale=0.1))
            pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the RND bonus: a target network and a predictor network.

        Both networks consume the last channel of every frame except the
        first time step, normalized with ``ph_mean`` / ``ph_std`` and clipped
        to [-5, 5]. Sets ``self.feat_var``, ``self.max_feat``,
        ``self.int_rew`` and ``self.aux_loss``.

        Args:
            convfeat: Base number of conv filters.
            rep_size: Width of the target and predictor output features.
            enlargement: Multiplier for the predictor's hidden dense layers.
        """
        logger.info(
            "Using RND BONUS ****************************************************"
        )

        # RND bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                      -5.0, 5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr, "c1r", nf=convfeat * 1, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, "c2r", nf=convfeat * 2 * 1, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, "c3r", nf=convfeat * 2 * 1, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(
                    xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp, "c1rp_pred", nf=convfeat, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, "c2rp_pred", nf=convfeat * 2, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, "c3rp_pred", nf=convfeat * 2, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp, "fc1r_hat1_pred", nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat, "fc1r_hat2_pred", nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat, "fc1r_hat3_pred", nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1, keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        # Keep each sample with probability
        # proportion_of_exp_used_for_predictor_update, then average over the
        # kept samples (the denominator is floored at 1).
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.0, maxval=1.0, dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the dynamics bonus with random target features.

        The target network encodes frames ``1:`` (last channel only); the
        predictor encodes frames ``:-1`` (all channels) and is conditioned on
        the one-hot action at every dense layer. Sets ``self.feat_var``,
        ``self.max_feat``, ``self.int_rew`` and ``self.aux_loss``.

        Args:
            convfeat: Base number of conv filters.
            rep_size: Width of the target and predictor output features.
            enlargement: Multiplier for the predictor's hidden dense layers.
        """
        # Dynamics loss with random features.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                      -5.0, 5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr, "c1r", nf=convfeat * 1, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, "c2r", nf=convfeat * 2 * 1, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, "c3r", nf=convfeat * 2 * 1, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None,
            None,
            self.ac_space.n,
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            # Append the one-hot action to the features.
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of
                # the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp, "c1rp_pred", nf=convfeat, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, "c2rp_pred", nf=convfeat * 2, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, "c3rp_pred", nf=convfeat * 2, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp), "fc1r_hat1_pred", nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat), "fc1r_hat2_pred", nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat), "fc1r_hat3_pred", nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1, keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.0, maxval=1.0, dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)

    def initial_state(self, n):
        """Return an all-zero float32 state array of shape ``(n, memsize)``."""
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """Run one rollout step through the default TensorFlow session.

        Args:
            dict_obs: Mapping from observation key to array; must contain
                ``'obs'`` and every key in ``self.ph_ob_keys``.
            new: Array fed to ``self.ph_new`` after adding a trailing axis.
            istate: Array fed to ``self.ph_istate``.
            update_obs_stats: Not supported; must be falsy whenever any
                observation is present.

        Returns:
            ``(actions, vpred_int, vpred_ext, neglogp, newstate, entropy)``;
            all but ``newstate`` are sliced to their first time step.

        Raises:
            NotImplementedError: If ``update_obs_stats`` is true and any
                observation is not ``None``.
        """
        # The original carried unreachable statistics-update statements after
        # this raise; they have been dropped.
        if update_obs_stats and any(
                ob is not None for ob in dict_obs.values()):
            raise NotImplementedError

        # Note: if it fails here with ph vs observations inconsistency, check
        # if you're loading agent from disk. It will use whatever observation
        # spaces saved to disk along with other ctor params.
        feed1 = {
            self.ph_ob[k]: dict_obs[k]
            for k in self.ph_ob_keys if k != 'obs'
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        # Add an extra empty dimension to the primary observation if needed.
        if len(dict_obs['obs'].shape) == 4:
            feed1[self.ph_ob['obs']] = dict_obs['obs'][:, None]
        else:
            feed1[self.ph_ob['obs']] = dict_obs['obs']
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session().run(
            [
                self.a_samp,
                self.vpred_int_rollout,
                self.vpred_ext_rollout,
                self.nlp_samp,
                self.snext_rollout,
                self.entropy_rollout,
            ],
            feed_dict={**feed1, **feed2},
        )
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
class CnnPolicy(StochasticPolicy):
    """Feed-forward CNN policy with a curiosity-bottleneck intrinsic reward.

    The constructor builds the policy graph twice with shared variables (one
    copy for optimization, one for rollouts), a stochastic bottleneck encoder
    whose KL term is exposed as ``self.int_rew``, and a set of Grad-CAM style
    tensors that the original author marked as unused.

    Shape annotations in the comments below are carried over from the
    original author and presumably assume 84x84x4 frames and 18 actions --
    TODO confirm against the environment in use.
    """

    def __init__(self, scope, ob_space, ac_space,
                 policy_size='normal', maxpool=False, extrahid=True,
                 hidsize=128, memsize=128, rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 exploration_type='bottleneck', beta=0.001, rew_counter=None
                 ):
        """Build placeholders, both policy graphs and the bottleneck network.

        Args:
            scope: Variable scope name handed to ``apply_policy``.
            ob_space: Observation space; ``ob_space.shape[:2]`` sizes the
                normalization placeholders and running statistics.
            ac_space: Action space, forwarded to ``StochasticPolicy``.
            policy_size: One of ``'small'``, ``'normal'``, ``'large'``;
                selects a width multiplier of 1, 2 or 4.
            maxpool: Accepted but not read anywhere in this class.
            extrahid: Whether to add the residual hidden layers before the
                policy and value heads.
            hidsize: Base width of the first dense layer (scaled).
            memsize: Base size of the state placeholder (scaled).
            rec_gate_init: Accepted but not read anywhere in this class.
            update_ob_stats_independently_per_gpu: When true, the running
                observation statistics are created with ``use_mpi=False``.
            proportion_of_exp_used_for_predictor_update: Threshold compared
                against uniform noise to mask samples out of the aux loss.
            exploration_type: Stored on ``self.exploration_type``.
            beta: Weight of the KL term in the auxiliary loss.
            rew_counter: Optional value; when given, the value-regression
                term is scaled by ``min(rew_counter / 1000, 1)``.
        """
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {
            'small': 1,
            'normal': 2,
            'large': 4
        }[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obmean")  # (84, 84, 1)
        self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obstd")    # (84, 84, 1)
        memsize *= enlargement      # 256 with the defaults
        hidsize *= enlargement      # 256 with the defaults
        convfeat = 16*enlargement   # 32 with the defaults
        self.ob_rms = RunningMeanStd(shape=list(ob_space.shape[:2])+[1],
                                     use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')  # (None, 256)
        pdparamsize = self.pdtype.param_shape()[0]  # 18, equal to the action dimension
        self.memsize = memsize

        # Inputs to policy and value function will have different shapes
        # depending on whether it is rollout or optimization time, so we
        # treat them separately.
        # pdparam_opt.shape=(None, None, 18), vpred_int_opt.shape=(None, None),
        # vpred_ext_opt.shape=(None, None), snext_opt.shape=(None, 256)
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize)
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize)

        self.exploration_type = exploration_type
        self.max_table = 0

        # Floor division keeps the layer width an int (64); the original
        # true division passed the float 64.0 down to the dense layers.
        self.define_bottleneck_rew(convfeat=convfeat, rep_size=rep_size // 8, enlargement=enlargement,
                                   beta=beta, rew_counter=rew_counter)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)  # policy distribution
        self.a_samp = pd.sample()                           # sampled action
        self.nlp_samp = pd.neglogp(self.a_samp)             # neg. log-prob of the sample
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
        self.a_samp_opt = self.pd_opt.sample()

        self.ph_istate = ph_istate

        self.scope = scope

        #############################################
        # The tensors below are not actually used   #
        # (per the original author's note).         #
        #############################################
        # NOTE: the lookups by tensor name depend on the exact order in which
        # ops were created above; do not reorder graph construction.

        # For Grad-CAM on the policy.
        a_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)  # (None,None) -> (None,None,18)
        # Multiplying by the one-hot keeps only the pdparam_opt entry of the
        # action that was taken.
        loss_cam_pol = tf.reduce_mean(tf.multiply(self.pdparam_opt, a_one_hot))

        self.conv_out = tf.get_default_graph().get_tensor_by_name('ppo/pol/Relu_2:0')
        self.grads = tf.gradients(loss_cam_pol, self.conv_out)[0]

        # For Grad-CAM on the auxiliary network.
        loss_cam_aux = self.kl
        # The leaky-relu output tensor has a different name depending on the
        # TensorFlow minor version.
        if int(str(tf.__version__).split('.')[1]) < 10:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2/Maximum:0')
        else:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2:0')
        self.grads_aux = tf.abs(tf.gradients(loss_cam_aux, self.conv_aux_out)[0])

        # self.cams is not actually used.
        weights = tf.reduce_mean(tf.reduce_mean(self.grads, 2), 1)
        weights = tf.expand_dims(tf.expand_dims(weights, axis=1), axis=1)
        weights = tf.tile(weights, [1, 6, 6, 1])
        cams = tf.reduce_sum((weights * self.conv_out), axis=3)
        self.cams = tf.maximum(cams, tf.zeros_like(cams))

        # self.cams_aux is not actually used.
        weights_aux = tf.reduce_mean(tf.reduce_mean(self.grads_aux, 2), 1)
        weights_aux = tf.expand_dims(tf.expand_dims(weights_aux, axis=1), axis=1)
        weights_aux = tf.tile(weights_aux, [1, 7, 7, 1])
        cams_aux = tf.nn.relu(tf.reduce_sum((weights_aux * self.conv_aux_out), axis=3))
        self.cams_aux = tf.maximum(cams_aux, tf.zeros_like(cams_aux))

    @staticmethod
    def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize):
        """Build the conv + dense policy/value network on ``ph_ob``.

        Args:
            ph_ob: Rank-5 image tensor (asserted); the last three axes are
                used as the per-frame image shape.
            reuse: Passed to ``tf.variable_scope`` to share variables.
            scope: Variable scope name.
            hidsize: Width of the ``fc1`` layer.
            memsize: Second dimension of the returned zero state.
            extrahid: Whether to add the ``fc2val`` / ``fc2act`` residual
                layers.
            sy_nenvs: First dimension used to reshape the outputs.
            sy_nsteps: Second dimension used to reshape the outputs.
            pdparamsize: Width of the policy-parameter head.

        Returns:
            ``(pdparam, vpred_int, vpred_ext, snext)`` where ``pdparam`` is
            reshaped to ``(sy_nenvs, sy_nsteps, pdparamsize)``, both value
            predictions to ``(sy_nenvs, sy_nsteps)`` and ``snext`` is a zero
            tensor of shape ``(sy_nenvs, memsize)``.
        """
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))  # (None, 84, 84, 4)

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            # (None, 84, 84, 4) -> (None, 20, 20, 32)
            X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
            # (None, 20, 20, 32) -> (None, 9, 9, 64)
            X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            # (None, 9, 9, 64) -> (None, 6, 6, 64)
            X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
            # (None, 6, 6, 64) -> (None, 2304)
            X = to2d(X)
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)  # (None, 2304)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))  # (None, 256)
            additional_size = 448
            X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))  # (None, 448)
            # No recurrent state in this policy: the next state is all zeros.
            snext = tf.zeros((sy_nenvs, memsize))  # (None, 256)
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)  # (None, 448)
            if extrahid:
                Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))  # (None, 448)
                X = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))          # (None, 448)
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)      # (None, 18)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)      # (None, 1)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)      # (None, 1)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))  # (None, None, 18)
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))           # (None, None)
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))           # (None, None)
        return pdparam, vpred_int, vpred_ext, snext

    def define_bottleneck_rew(self, convfeat, rep_size, enlargement, beta=1e-2, rew_counter=None):
        """Build the curiosity-bottleneck encoder, reward and auxiliary loss.

        A conv encoder over the last channel of frames ``1:`` produces a mean
        and a softplus standard deviation; a sample ``z`` feeds a scalar
        value head. The per-sample KL term becomes ``self.int_rew`` (with the
        gradient stopped) and is combined with the squared value error into
        ``self.aux_loss``. Also sets ``self.feat_var``, ``self.max_feat``,
        ``self.kl``, ``self.aux_loss_raw`` and ``self.v_int``.

        Args:
            convfeat: Base number of conv filters.
            rep_size: Width of the ``mu`` / ``sigma`` layers.
            enlargement: Accepted but not read in this method.
            beta: Weight of the KL term.
            rew_counter: Optional value; when given, the value-regression
                term is scaled by ``min(rew_counter / 1000, 1)``.
        """
        logger.info("Using Curiosity Bottleneck ****************************************************")
        v_target = tf.reshape(self.ph_ret_ext, (-1, 1))

        if rew_counter is None:
            sched_coef = 1.
        else:
            sched_coef = tf.minimum(rew_counter/1000, 1.)

        # Bottleneck encoder network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C. ph.shape=(None,None,84,84,4)
                logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]    # (None, 84, 84, 1)
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)  # (None, 84, 84, 1)

                xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))      # (None, 20, 20, 32)
                xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))  # (None, 9, 9, 64)
                xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))  # (None, 7, 7, 64)
                rgbr = [to2d(xr)]  # (None, 3136)
                mu = fc(rgbr[0], 'fc_mu', nh=rep_size, init_scale=np.sqrt(2))                        # (None, 64)
                sigma = tf.nn.softplus(fc(rgbr[0], 'fc_sigma', nh=rep_size, init_scale=np.sqrt(2)))  # (None, 64)
                # Reparameterized sample from N(mu, sigma^2).
                z = mu + sigma * tf.random_normal(tf.shape(mu), 0, 1, dtype=tf.float32)              # (None, 64)
                v = fc(z, 'value', nh=1, init_scale=np.sqrt(2))                                      # (None, 1)

        self.feat_var = tf.reduce_mean(sigma)
        self.max_feat = tf.reduce_max(tf.abs(z))

        # The 1e-8 keeps the log finite when sigma is 0.
        self.kl = 0.5 * tf.reduce_sum(
            tf.square(mu) + tf.square(sigma) - tf.log(1e-8 + tf.square(sigma)) - 1,
            axis=-1, keep_dims=True)
        self.int_rew = tf.stop_gradient(self.kl)
        self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        self.aux_loss_raw = sched_coef * tf.square(v_target - v) + beta * self.kl
        # self.aux_loss_raw = beta * self.kl
        self.aux_loss = sched_coef * tf.square(v_target - v) + beta * self.kl  # (None, 1)
        # mask holds uniform random numbers in [0, 1).
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)  # (None, 1)
        # After thresholding the mask is 1.0 for kept samples (all of them
        # when the proportion is 1.0).
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)  # (None, 1)
        # Average aux_loss over the kept positions (denominator floored at 1).
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)
        self.v_int = v  # (None, 1)

    def initial_state(self, n):
        """Return an all-zero float32 state array of shape ``(n, memsize)``."""
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """Run one rollout step through the default TensorFlow session.

        Args:
            dict_obs: Mapping from observation key to array; every key in
                ``self.ph_ob_keys`` is fed after adding a time axis.
            new: Array fed to ``self.ph_new`` after adding a trailing axis.
            istate: Array fed to ``self.ph_istate``.
            update_obs_stats: Not supported; must be falsy whenever any
                observation is present.

        Returns:
            ``(actions, vpred_int, vpred_ext, neglogp, newstate, entropy)``;
            all but ``newstate`` are sliced to their first time step.

        Raises:
            NotImplementedError: If ``update_obs_stats`` is true and any
                observation is not ``None``.
        """
        # The original carried unreachable statistics-update statements after
        # this raise; they have been dropped.
        if update_obs_stats and any(ob is not None for ob in dict_obs.values()):
            raise NotImplementedError

        # Note: if it fails here with ph vs observations inconsistency, check
        # if you're loading agent from disk. It will use whatever observation
        # spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {self.ph_istate: istate, self.ph_new: new[:, None].astype(np.float32)}
        feed1.update({self.ph_mean: self.ob_rms.mean, self.ph_std: self.ob_rms.var ** 0.5})
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session().run(
            [self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
             self.nlp_samp, self.snext_rollout, self.entropy_rollout],
            feed_dict={**feed1, **feed2})
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
class CnnGruPolicy(StochasticPolicy):
    """Recurrent (CNN + GRU) policy with intrinsic and extrinsic value heads.

    The constructor builds the policy graph twice with shared variables (one
    copy for optimization, one for rollouts), an RND or dynamics bonus
    network, and a separate "step" predictor network.

    ``self.stepvalues`` is read by two methods but is not defined in this
    class; presumably it is provided by ``StochasticPolicy`` -- TODO confirm.
    """

    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size='normal',
        maxpool=False,
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.,
        dynamics_bonus=False,
    ):
        """Build placeholders, both policy graphs and the bonus networks.

        Args:
            scope: Variable scope name handed to ``apply_policy``.
            ob_space: Observation space; ``ob_space.shape[:2]`` sizes the
                normalization placeholders and running statistics.
            ac_space: Action space, forwarded to ``StochasticPolicy``.
            policy_size: One of ``'small'``, ``'normal'``, ``'large'``;
                selects a width multiplier of 1, 2 or 4.
            maxpool: Accepted but not read anywhere in this class.
            extrahid: Whether to add the residual hidden layers before the
                policy and value heads.
            hidsize: Base width of the first dense layer (scaled).
            memsize: Base GRU state size (scaled).
            rec_gate_init: Forwarded to ``GRUCell``.
            update_ob_stats_independently_per_gpu: When true, the running
                observation statistics are created with ``use_mpi=False``.
            proportion_of_exp_used_for_predictor_update: Threshold compared
                against uniform noise to mask samples out of the losses.
            dynamics_bonus: Selects the dynamics-prediction bonus instead of
                the self-prediction (RND) bonus.
        """
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement  # 256 with the defaults
        hidsize *= enlargement  # 256 with the defaults
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # The optimization graph drops the last time step of the observation
        # placeholder; the rollout graph uses all of it.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)
        # NOTE(review): the original indentation was lost; this call is
        # assumed to run for both bonus types so that step_loss and
        # int_step_rew always exist -- confirm against the upstream file.
        self.step_prediction(convfeat=convfeat,
                             rep_size=rep_size,
                             enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                     extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        """Build the conv + GRU policy/value network on ``ph_ob``.

        Args:
            ph_ob: Rank-5 image tensor (asserted); the last three axes are
                used as the per-frame image shape.
            ph_new: Tensor fed to the GRU alongside the features after a
                trailing axis is added.
            ph_istate: Initial GRU state.
            reuse: Passed to ``tf.variable_scope`` to share variables.
            scope: Variable scope name.
            hidsize: Width of the ``fc1`` layer.
            memsize: GRU state size.
            extrahid: Whether to add the ``fc2val`` / ``fc2act`` residual
                layers.
            sy_nenvs: First dimension used to reshape features and outputs.
            sy_nsteps: Second dimension used to reshape features and outputs.
            pdparamsize: Width of the policy-parameter head.
            rec_gate_init: Forwarded to ``GRUCell``.

        Returns:
            ``(pdparam, vpred_int, vpred_ext, snext)`` where ``pdparam`` is
            reshaped to ``(sy_nenvs, sy_nsteps, pdparamsize)``, both value
            predictions to ``(sy_nenvs, sy_nsteps)`` and ``snext`` is the
            final state returned by ``tf.nn.dynamic_rnn``.
        """
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        # Merge the two leading axes so the convs see a flat batch of frames.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(scope, reuse=reuse), \
                tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X, 'c1', nf=32, rf=8, stride=4,
                     init_scale=np.sqrt(2), data_format=data_format))
            X = activ(
                conv(X, 'c2', nf=64, rf=4, stride=2,
                     init_scale=np.sqrt(2), data_format=data_format))
            X = activ(
                conv(X, 'c3', nf=64, rf=4, stride=1,
                     init_scale=np.sqrt(2), data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            # Restore the (env, step) axes for the recurrent layer.
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the RND bonus: a target network and a predictor network.

        Both networks consume the last channel of every frame except the
        first time step, normalized with ``ph_mean`` / ``ph_std`` and clipped
        to [-5, 5]. The training target is the target-network output shifted
        by ``sqrt(self.stepvalues / 512)``. Sets ``self.feat_var``,
        ``self.max_feat``, ``self.int_rew`` and ``self.aux_loss``.

        Args:
            convfeat: Base number of conv filters.
            rep_size: Width of the target and predictor output features.
            enlargement: Multiplier for the predictor's hidden dense layers.
        """
        # RND.
        # Random target network.
        print('self_predict')
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                      -5.0, 5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(
                    xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1, keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(
            tf.square(noisy_targets + tf.sqrt(self.stepvalues / 512) -
                      X_r_hat), -1)
        # Keep each sample with probability
        # proportion_of_exp_used_for_predictor_update, then average over the
        # kept samples (the denominator is floored at 1).
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0., maxval=1., dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def step_prediction(self, convfeat, rep_size, enlargement):
        """Build the step predictor network and its reward and loss.

        The predictor consumes the last channel of every frame except the
        final time step. Its mean squared output becomes
        ``self.int_step_rew``; it is trained to match
        ``sqrt(self.stepvalues / 512)`` through ``self.step_loss``.

        Args:
            convfeat: Base number of conv filters.
            rep_size: Width of the predictor output features.
            enlargement: Multiplier for the hidden dense layers.
        """
        print('step_predict')

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xstep = ph[:, 0:-1]
                xstep = tf.cast(xstep, tf.float32)
                xstep = tf.reshape(
                    xstep, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xstep = tf.clip_by_value(
                    (xstep - self.ph_mean) / self.ph_std, -5.0, 5.0)

                xstep = tf.nn.leaky_relu(
                    conv(xstep, 'c1step_pred', nf=convfeat, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xstep = tf.nn.leaky_relu(
                    conv(xstep, 'c2step_pred', nf=convfeat * 2, rf=4,
                         stride=2, init_scale=np.sqrt(2)))
                xstep = tf.nn.leaky_relu(
                    conv(xstep, 'c3step_pred', nf=convfeat * 2, rf=3,
                         stride=1, init_scale=np.sqrt(2)))
                rgbrp = to2d(xstep)
                X_r_step = tf.nn.relu(
                    fc(rgbrp, 'fc1_step_pred', nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_step = tf.nn.relu(
                    fc(X_r_step, 'fc2_step_pred', nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_step = fc(X_r_step, 'fc3_step_pred', nh=rep_size,
                              init_scale=np.sqrt(2))

        self.int_step_rew = tf.reduce_mean(tf.square(X_r_step),
                                           axis=-1, keep_dims=True)
        self.int_step_rew = tf.reshape(self.int_step_rew,
                                       (self.sy_nenvs, self.sy_nsteps - 1))

        self.step_loss = tf.reduce_mean(
            tf.square(tf.sqrt(self.stepvalues / 512) - X_r_step), -1)
        mask = tf.random_uniform(shape=tf.shape(self.step_loss),
                                 minval=0., maxval=1., dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.step_loss = tf.reduce_sum(mask * self.step_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the dynamics-based bonus with random target features.

        The target network encodes frames ``1:`` (last channel only); the
        predictor encodes frames ``:-1`` (all channels) and is conditioned on
        the one-hot action at every dense layer. Sets ``self.feat_var``,
        ``self.max_feat``, ``self.int_rew`` and ``self.aux_loss``.

        Args:
            convfeat: Base number of conv filters.
            rep_size: Width of the target and predictor output features.
            enlargement: Multiplier for the predictor's hidden dense layers.
        """
        # Dynamics based bonus.
        print('dynamics predict')

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                      -5.0, 5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            # Append the one-hot action to the features.
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of
                # the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp), 'fc1r_hat1_pred', nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat), 'fc1r_hat2_pred', nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat), 'fc1r_hat3_pred', nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1, keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0., maxval=1., dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def initial_state(self, n):
        """Return an all-zero float32 state array of shape ``(n, memsize)``."""
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """Run one rollout step through the default TensorFlow session.

        Args:
            dict_obs: Mapping from observation key to array; every key in
                ``self.ph_ob_keys`` is fed after adding a time axis.
            new: Array fed to ``self.ph_new`` after adding a trailing axis.
            istate: Array fed to ``self.ph_istate``.
            update_obs_stats: Not supported; must be falsy whenever any
                observation is present.

        Returns:
            ``(actions, vpred_int, vpred_ext, neglogp, newstate, entropy)``;
            all but ``newstate`` are sliced to their first time step.

        Raises:
            NotImplementedError: If ``update_obs_stats`` is true and any
                observation is not ``None``.
        """
        # The original carried unreachable statistics-update statements after
        # this raise; they have been dropped.
        if update_obs_stats and any(
                ob is not None for ob in dict_obs.values()):
            raise NotImplementedError

        # Note: if it fails here with ph vs observations inconsistency, check
        # if you're loading agent from disk. It will use whatever observation
        # spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session().run(
            [
                self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
                self.nlp_samp, self.snext_rollout, self.entropy_rollout
            ],
            feed_dict={**feed1, **feed2})
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
class PPO_RND(object):
    """PPO trainer that combines extrinsic and intrinsic (RND) advantages.

    The constructor builds a current policy (``pi``), a frozen copy of it
    (``oldpi``), the ``RND`` intrinsic-reward model and the TensorFlow
    loss/gradient functions. ``train`` then runs clipped-surrogate updates
    over one rollout segment.
    """

    def __init__(self, ob_space, ac_space, policy_type, args):
        """Build policies, the intrinsic-reward model and the training graph.

        Args:
            ob_space: observation space handed to the policy constructors.
            ac_space: action space handed to the policy constructors.
            policy_type: architecture name; only ``"coord_cnn"`` is supported.
            args: namespace providing ``gamma``, ``lam``, ``adam_epsilon``,
                ``clip_param``, ``entcoeff``, ``optim_stepsize``,
                ``int_coeff``, ``ext_coeff``, ``hidden_size``,
                ``num_hid_layers`` and ``kind``; it is also forwarded to
                ``RND``.

        Raises:
            ValueError: if ``policy_type`` is not a supported architecture.
        """
        self.gamma = args.gamma
        self.lam = args.lam
        self.adam_epsilon = args.adam_epsilon
        self.clip_param = args.clip_param
        self.entcoeff = args.entcoeff
        self.optim_stepsize = args.optim_stepsize
        self.int_coeff = args.int_coeff
        self.ext_coeff = args.ext_coeff
        self.ob_space = ob_space
        self.ac_space = ac_space
        self.policy_type = policy_type

        # Fail early: without this guard `self.pi` would simply be missing
        # and the constructor would die later with an unrelated error.
        if self.policy_type != "coord_cnn":
            raise ValueError(
                "PPO_RND only supports policy_type 'coord_cnn', got %r" %
                (self.policy_type, ))

        self.pi = CoordConvPolicy("pi", self.ob_space, self.ac_space,
                                  args.hidden_size, args.num_hid_layers,
                                  args.kind)
        self.oldpi = CoordConvPolicy("oldpi", self.ob_space, self.ac_space,
                                     args.hidden_size, args.num_hid_layers,
                                     args.kind)
        self.int_rew = RND("rnd_int_rew", self.pi.ob, args)
        self.rff_int = RewardForwardFilter(args.gamma)
        self.rff_rms_int = RunningMeanStd(comm=MPI.COMM_SELF, use_mpi=True)
        self.build_graph()
        U.initialize()
        self.adam.sync()

    @staticmethod
    def _standardize(adv, eps=1e-8):
        """Return ``adv`` shifted to zero mean and scaled to unit std.

        ``eps`` is added to the standard deviation so that a constant batch
        yields zeros instead of NaN/inf.
        """
        adv = np.asarray(adv)
        return (adv - adv.mean()) / (adv.std() + eps)

    def build_graph(self):
        """Create the loss tensors, the optimizer and the callable functions.

        Sets ``self.losses``, ``self.loss_names``, ``self.lossandgrad``,
        ``self.compute_losses``, ``self.adam`` and ``self.assign_old_eq_new``.
        """
        atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        ret_ext = tf.placeholder(dtype=tf.float32,
                                 shape=[None])  # Extrinsic return
        ret_int = tf.placeholder(dtype=tf.float32,
                                 shape=[None])  # Intrinsic return
        lrmult = tf.placeholder(
            name='lrmult', dtype=tf.float32,
            shape=[])  # learning rate multiplier, updated with schedule
        clip_param = self.clip_param * lrmult  # Annealed clipping parameter epsilon

        # `ob` is concatenated to a list of placeholders below, so it is
        # presumably a list of observation placeholders — confirm in the
        # policy class.
        ob = self.pi.ob
        ac = self.pi.pdtype.sample_placeholder([None])

        kloldnew = self.oldpi.pd.kl(self.pi.pd)
        ent = self.pi.pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        pol_entpen = (-self.entcoeff) * meanent

        ratio = tf.exp(self.pi.pd.logp(ac) -
                       self.oldpi.pd.logp(ac))  # pnew / pold
        surr1 = ratio * atarg  # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                 1.0 + clip_param) * atarg
        pol_surr = -tf.reduce_mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

        vf_ext_loss = tf.reduce_mean(tf.square(self.pi.vpred_ext - ret_ext))
        vf_int_loss = tf.reduce_mean(tf.square(self.pi.vpred_int - ret_int))
        vf_loss = vf_ext_loss + vf_int_loss
        total_loss = pol_surr + pol_entpen + vf_loss + self.int_rew.aux_loss

        self.losses = [
            pol_surr, pol_entpen, vf_ext_loss, vf_int_loss, meankl, meanent,
            self.int_rew.aux_loss
        ]
        self.loss_names = [
            "pol_surr", "pol_entpen", "vf_ext_loss", "vf_int_loss", "kl",
            "ent", "aux_loss"
        ]

        # The policy and the intrinsic-reward model are optimized jointly.
        var_list = (self.pi.get_trainable_variables() +
                    self.int_rew.get_trainable_variables())
        self.lossandgrad = U.function(
            [ac, atarg, ret_ext, ret_int, lrmult] + ob,
            self.losses + [U.flatgrad(total_loss, var_list)])
        self.compute_losses = U.function(
            [ac, atarg, ret_ext, ret_int, lrmult] + ob, self.losses)
        self.adam = MpiAdam(var_list, epsilon=self.adam_epsilon)
        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                    self.oldpi.get_variables(), self.pi.get_variables())
            ])

    def train(self, seg, optim_batchsize, optim_epochs):
        """Run ``optim_epochs`` passes of minibatch updates over ``seg``.

        Args:
            seg: rollout segment dict. ``seg["rew_int"]`` is rescaled in
                place; ``add_vtarg_and_adv`` is then expected to populate the
                advantage / return entries read below.
            optim_batchsize: minibatch size passed to ``Dataset.iterate_once``.
            optim_epochs: number of optimization passes over the segment.

        Returns:
            The mean of each loss in ``self.loss_names`` as computed by
            ``mpi_moments`` after the updates.
        """
        # normalize the reward
        rffs_int = np.array(
            [self.rff_int.update(rew) for rew in seg["rew_int"]])
        self.rff_rms_int.update(rffs_int.ravel())
        seg["rew_int"] = seg["rew_int"] / np.sqrt(self.rff_rms_int.var)

        cur_lrmult = 1.0
        add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob = seg["ob"]
        unnorm_ac = seg["unnorm_ac"]
        atarg_ext = seg["adv_ext"]
        tdlamret_ext = seg["tdlamret_ext"]
        atarg_int = seg["adv_int"]
        tdlamret_int = seg["tdlamret_int"]
        vpredbefore_ext = seg[
            "vpred_ext"]  # predicted value function before update

        # standardized advantage function estimates, mixed by coefficient
        atarg_ext = self._standardize(atarg_ext)
        atarg_int = self._standardize(atarg_int)
        atarg = self.int_coeff * atarg_int + self.ext_coeff * atarg_ext

        d = Dataset(dict(ob=ob,
                         ac=unnorm_ac,
                         atarg=atarg,
                         vtarg_ext=tdlamret_ext,
                         vtarg_int=tdlamret_int),
                    shuffle=not self.pi.recurrent)

        if hasattr(self.pi, "ob_rms"):
            self.pi.update_obs_rms(ob)  # update running mean/std for policy
        if hasattr(self.int_rew, "ob_rms"):
            self.int_rew.update_obs_rms(
                ob)  # update running mean/std for int_rew

        self.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                      batch["vtarg_ext"], batch["vtarg_int"],
                                      cur_lrmult, *zip(*batch["ob"].tolist()))
                new_losses, g = lg[:-1], lg[-1]
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(new_losses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                            batch["vtarg_ext"],
                                            batch["vtarg_int"], cur_lrmult,
                                            *zip(*batch["ob"].tolist()))
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular(
            "ev_tdlam_ext_before",
            explained_variance(vpredbefore_ext, tdlamret_ext))
        return meanlosses
class MlpPolicy(StochasticPolicy):
    """Fully-connected policy with intrinsic/extrinsic value heads.

    The constructor builds two copies of the policy network sharing weights
    (one for optimization, one for rollouts) and an intrinsic-reward model:
    either the self-prediction (RND) bonus or the dynamics-prediction bonus.
    """

    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size='small',
        maxpool=False,
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.,
        dynamics_bonus=False,
    ):
        """Create placeholders, policy heads and the intrinsic-reward graph.

        Args:
            scope: variable scope name for the policy network.
            ob_space: observation space; its ``shape`` sizes the
                normalization placeholders and running statistics.
            ac_space: action space, forwarded to ``StochasticPolicy``.
            policy_size: ``'small'``, ``'normal'`` or ``'large'``; multiplies
                ``hidsize``, ``memsize`` and the bonus network width by 1, 2
                or 4.
            maxpool: unused in this class.
            extrahid: add the residual ``fc2val`` / ``fc2act`` layers.
            hidsize: base width of the first two hidden layers.
            memsize: base size of the state vector.
            rec_gate_init: unused in this class.
            update_ob_stats_independently_per_gpu: when true, the running
                observation statistics do not use MPI.
            proportion_of_exp_used_for_predictor_update: fraction of samples
                kept (by random mask) in the predictor loss.
            dynamics_bonus: use the dynamics-prediction bonus instead of the
                self-prediction bonus.

        Raises:
            KeyError: if ``policy_size`` is not one of the supported names.
        """
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update

        # Scale the sizes BEFORE creating the state placeholder and storing
        # self.memsize. apply_policy() builds `snext` with the scaled size, so
        # the placeholder and initial_state() must use the same size or the
        # returned state cannot be fed back in for non-'small' policies.
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 16
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement

        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape),
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape),
                                     name="obstd")
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape),
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # Inputs to policy and value function will have different shapes depending on whether it is rollout
        # or optimization time, so we treat separately.
        (
            self.pdparam_opt,
            self.vpred_int_opt,
            self.vpred_ext_opt,
            self.snext_opt,
        ) = self.apply_policy(
            self.ph_ob[None][:, :-1],
            reuse=False,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps - 1,
            pdparamsize=pdparamsize,
        )
        (
            self.pdparam_rollout,
            self.vpred_int_rollout,
            self.vpred_ext_rollout,
            self.snext_rollout,
        ) = self.apply_policy(
            self.ph_ob[None],
            reuse=True,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps,
            pdparamsize=pdparamsize,
        )

        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob,
                     reuse,
                     scope,
                     hidsize,
                     memsize,
                     extrahid,
                     sy_nenvs,
                     sy_nsteps,
                     pdparamsize,
                     use_action_balance=None):
        """Build the fully-connected policy / value network.

        Args:
            ph_ob: rank-3 observation tensor (B, T, S).
            reuse: ``reuse`` flag for the variable scope.
            scope: variable scope name.
            hidsize: width of ``fc_0`` and ``fc_1``.
            memsize: second dimension of the (all-zero) returned state.
            extrahid: add the residual ``fc2val`` / ``fc2act`` layers.
            sy_nenvs: number of environments, used to reshape the outputs.
            sy_nsteps: number of time steps, used to reshape the outputs.
            pdparamsize: size of the action-distribution parameter vector.
            use_action_balance: accepted but not used.

        Returns:
            ``(pdparam, vpred_int, vpred_ext, snext)`` with shapes
            ``(sy_nenvs, sy_nsteps, pdparamsize)``, ``(sy_nenvs, sy_nsteps)``,
            ``(sy_nenvs, sy_nsteps)`` and ``(sy_nenvs, memsize)``.
        """
        ph = ph_ob
        assert len(ph.shape.as_list()) == 3  # B,T,S
        logger.info("Mlp Policy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32)
        # Flatten batch and time into one leading axis.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-1:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(fc(X, 'fc_0', nh=hidsize, init_scale=np.sqrt(2)))
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc_1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 64
            X = activ(
                fc(X,
                   'fc_additional',
                   nh=additional_size,
                   init_scale=np.sqrt(2)))
            # No recurrence here: the "next state" is a constant zero tensor.
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            # if use_action_balance:

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext

    def define_action_balance_rew(self, units, rep_size):
        """Build the action-balance bonus over (state, action) pairs.

        A target network and a wider predictor network both consume the
        normalized observation concatenated with the one-hot action. Sets
        ``feat_var_ab``, ``max_feat_ab``, ``int_rew_ab`` and ``aux_loss_ab``.

        Args:
            units: hidden width of the target network (the predictor uses
                ``units * 2``).
            rep_size: size of the output representation.
        """
        logger.info(
            "Using Action Balance BONUS ****************************************************"
        )
        # (s, a) seen frequency as bonus
        with tf.variable_scope('action_balance', reuse=tf.AUTO_REUSE):
            ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
            assert ac_one_hot.get_shape().ndims == 3
            assert ac_one_hot.get_shape().as_list() == [
                None, None, self.ac_space.n
            ], ac_one_hot.get_shape().as_list()
            ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

            def cond(x):
                return tf.concat([x, ac_one_hot], 1)

            # Random target network.
            for ph in self.ph_ob.values():
                if len(ph.shape.as_list()) == 3:  # B,T,S
                    logger.info(
                        "Mlp Target: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
                    xr = ph[:, :-1]
                    xr = tf.cast(xr, tf.float32)
                    xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                    xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                          -5.0, 5.0)

                    xr = tf.nn.relu(
                        fc(cond(xr),
                           'fc_sa0_r',
                           nh=units,
                           init_scale=np.sqrt(2)))
                    xr = tf.nn.relu(
                        fc(cond(xr),
                           'fc_sa1_r',
                           nh=units,
                           init_scale=np.sqrt(2)))
                    X_r = fc(cond(xr),
                             'fc_sa2_r',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

            # Predictor network.
            # The predictor layers get their own '_pred' names. Reusing the
            # target's '_r' names inside this AUTO_REUSE scope would make the
            # two networks resolve to the same variables (with a different
            # width requested), instead of training a separate predictor.
            for ph in self.ph_ob.values():
                if len(ph.shape.as_list()) == 3:  # B,T,S
                    logger.info(
                        "Mlp Target: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
                    xrp = ph[:, :-1]
                    xrp = tf.cast(xrp, tf.float32)
                    xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                    xrp = tf.clip_by_value(
                        (xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)

                    xrp = tf.nn.relu(
                        fc(cond(xrp),
                           'fc_sa0_pred',
                           nh=units * 2,
                           init_scale=np.sqrt(2)))
                    xrp = tf.nn.relu(
                        fc(cond(xrp),
                           'fc_sa1_pred',
                           nh=units * 2,
                           init_scale=np.sqrt(2)))
                    X_r_hat = fc(cond(xrp),
                                 'fc_sa2_pred',
                                 nh=rep_size,
                                 init_scale=np.sqrt(2))

            self.feat_var_ab = tf.reduce_mean(
                tf.nn.moments(X_r, axes=[0])[1])
            self.max_feat_ab = tf.reduce_max(tf.abs(X_r))
            self.int_rew_ab = tf.reduce_mean(
                tf.square(tf.stop_gradient(X_r) - X_r_hat),
                axis=-1,
                keep_dims=True)
            self.int_rew_ab = tf.reshape(self.int_rew_ab,
                                         (self.sy_nenvs, self.sy_nsteps - 1))

            noisy_targets = tf.stop_gradient(X_r)
            # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
            self.aux_loss_ab = tf.reduce_mean(
                tf.square(noisy_targets - X_r_hat), -1)

            # Keep each sample in the loss with the configured probability.
            mask = tf.random_uniform(shape=tf.shape(self.aux_loss_ab),
                                     minval=0.,
                                     maxval=1.,
                                     dtype=tf.float32)
            mask = tf.cast(
                mask < self.proportion_of_exp_used_for_predictor_update,
                tf.float32)
            self.aux_loss_ab = tf.reduce_sum(
                mask * self.aux_loss_ab) / tf.maximum(tf.reduce_sum(mask), 1.)

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the self-prediction (RND) bonus on the next observation.

        Sets ``feat_var``, ``max_feat``, ``int_rew`` (reshaped to
        ``(sy_nenvs, sy_nsteps - 1)``) and ``aux_loss``.

        Args:
            convfeat: hidden layers have width ``convfeat * 2``.
            rep_size: size of the target / predictor output.
            enlargement: accepted but not used here.
        """
        logger.info(
            "Using RND BONUS ****************************************************"
        )
        hidden_size = convfeat * 2
        # RND bonus.
        activ = tf.nn.relu

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]  # get next status index is 1:
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = activ(
                    fc(xr, 'fc_0_r', nh=hidden_size, init_scale=np.sqrt(2)))
                xr = activ(
                    fc(xr, 'fc_1_r', nh=hidden_size, init_scale=np.sqrt(2)))
                X_r = fc(xr, 'fc_2_r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = activ(
                    fc(xrp,
                       'fc_0_pred',
                       nh=hidden_size,
                       init_scale=np.sqrt(2)))
                xrp = activ(
                    fc(xrp,
                       'fc_1_pred',
                       nh=hidden_size,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(xrp,
                             'fc_2_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)

        # Keep each sample in the loss with the configured probability.
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the dynamics-prediction bonus with random target features.

        Sets ``feat_var``, ``max_feat``, ``int_rew`` (reshaped to
        ``(sy_nenvs, sy_nsteps - 1)``) and ``aux_loss``.

        Args:
            convfeat: accepted but not used here.
            rep_size: size of the target / predictor output.
            enlargement: accepted but not used here.
        """
        # Dynamics loss with random features.
        activ = tf.nn.relu

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]  # get next status index is 1:
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = activ(fc(xr, 'fc_0_r', nh=32, init_scale=np.sqrt(2)))
                xr = activ(fc(xr, 'fc_1_r', nh=32, init_scale=np.sqrt(2)))
                X_r = fc(xr, 'fc_2_r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        # NOTE(review): `cond` is defined but never applied below, so the
        # predictor is not actually conditioned on the action — confirm
        # whether that is intended.
        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = activ(fc(xrp, 'fc_0_pred', nh=32,
                               init_scale=np.sqrt(2)))
                xrp = activ(fc(xrp, 'fc_1_pred', nh=32,
                               init_scale=np.sqrt(2)))
                X_r_hat = fc(xrp,
                             'fc_2r_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)

        # Keep each sample in the loss with the configured probability.
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def initial_state(self, n):
        """Return an all-zero float32 state array of shape ``(n, memsize)``."""
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """Sample one action per environment for a single time step.

        Args:
            dict_obs: observations keyed like ``self.ph_ob``; each array gets
                a singleton time axis inserted before being fed.
            new: per-environment array fed (as float32) to ``self.ph_new``.
            istate: state array fed to the ``state`` placeholder.
            update_obs_stats: updating observation statistics from here is
                not supported.

        Returns:
            ``(action, vpred_int, vpred_ext, neglogp, newstate, entropy)``;
            all but ``newstate`` are sliced to drop the time axis.

        Raises:
            NotImplementedError: if ``update_obs_stats`` is true and at least
                one observation is not ``None``.
        """
        if update_obs_stats and any(ob is not None
                                    for ob in dict_obs.values()):
            raise NotImplementedError
        # Note: if it fails here with ph vs observations inconsistency, check if you're loading agent from disk.
        # It will use whatever observation spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        # for f in feed1:
        #     print(f)
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session(
        ).run([
            self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
            self.nlp_samp, self.snext_rollout, self.entropy_rollout
        ],
              feed_dict={
                  **feed1,
                  **feed2
              })
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
class CnnGruPolicy(StochasticPolicy): def __init__(self, scope, ob_space, ac_space, policy_size='normal', maxpool=False, extrahid=True, hidsize=128, memsize=128, rec_gate_init=0.0, update_ob_stats_independently_per_gpu=True, proportion_of_exp_used_for_predictor_update=1., dynamics_bonus=False, num_agents=1, rnd_type='rnd', div_type='oracle', indep_rnd=False, indep_policy=False, sd_type='oracle', rnd_mask_prob=1.): StochasticPolicy.__init__(self, scope, ob_space, ac_space) self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size] rep_size = 512 self.rnd_mask = tf.placeholder(dtype=tf.float32, shape=(None, None, num_agents), name="rnd_mask") self.new_rnd_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="new_rnd_mask") self.div_train_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="div_train_mask") self.sample_agent_prob = tf.placeholder(dtype=tf.float32, shape=( None, None, ), name="sample_agent_prob") self.stage_label = tf.placeholder(dtype=tf.int32, shape=(None, None), name="stage_label") self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean") self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd") self.ph_count = tf.placeholder(dtype=tf.float32, shape=(), name="obcount") self.sep_ph_mean = tf.placeholder(dtype=tf.float32, shape=( None, None, ) + ob_space.shape[:2] + (1, ), name="sep_obmean") self.sep_ph_std = tf.placeholder(dtype=tf.float32, shape=( None, None, ) + ob_space.shape[:2] + (1, ), name="sep_obstd") self.sep_ph_count = tf.placeholder(dtype=tf.float32, shape=(), name="sep_obcount") self.game_score = tf.placeholder(dtype=tf.float32, shape=(None, None), name="game_score") self.last_rew_ob = tf.placeholder(dtype=ob_space.dtype, shape=(None, None) + tuple(ob_space.shape), name="last_rew_ob") self.div_ph_mean = 
tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="div_obmean") self.div_ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="div_obstd") self.idle_agent_label = tf.placeholder(dtype=tf.int32, shape=( None, None, ), name="idle_agent_label") self.rew_agent_label = tf.placeholder(dtype=tf.int32, shape=( None, None, ), name="rew_agent_label") #self.var_ph_mean = tf.get_variable("var_ph_mean", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0)) #self.var_ph_std = tf.get_variable("var_ph_std", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0)) #self.var_ph_count = tf.get_variable("var_ph_count", (), initializer=tf.constant_initializer(0.0)) self.sd_ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="sd_obmean") self.sd_ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="sd_obstd") memsize *= enlargement hidsize *= enlargement convfeat = 16 * enlargement self.ob_rms_list = [RunningMeanStd(shape=list(ob_space.shape[:2])+[1], use_mpi= not update_ob_stats_independently_per_gpu) \ for _ in range(num_agents)] self.ob_rms = RunningMeanStd( shape=list(ob_space.shape[:2]) + [1], use_mpi=not update_ob_stats_independently_per_gpu) self.diversity_ob_rms = RunningMeanStd( shape=list(ob_space.shape[:2]) + [1], use_mpi=not update_ob_stats_independently_per_gpu) ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state') pdparamsize = self.pdtype.param_shape()[0] self.memsize = memsize self.num_agents = num_agents self.indep_rnd = indep_rnd self.indep_policy = indep_policy self.num_agents = num_agents if num_agents <= 0: self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \ self.apply_policy(self.ph_ob[None][:,:-1], ph_new=self.ph_new, ph_istate=ph_istate, reuse=False, scope=scope, hidsize=hidsize, memsize=memsize, extrahid=extrahid, sy_nenvs=self.sy_nenvs, 
sy_nsteps=self.sy_nsteps - 1, pdparamsize=pdparamsize, rec_gate_init=rec_gate_init ) self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \ self.apply_policy(self.ph_ob[None], ph_new=self.ph_new, ph_istate=ph_istate, reuse=True, scope=scope, hidsize=hidsize, memsize=memsize, extrahid=extrahid, sy_nenvs=self.sy_nenvs, sy_nsteps=self.sy_nsteps, pdparamsize=pdparamsize, rec_gate_init=rec_gate_init ) else: self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \ self.apply_multi_head_policy(self.ph_ob[None][:,:-1], ph_new=self.ph_new, ph_istate=ph_istate, reuse=False, scope=scope, hidsize=hidsize, memsize=memsize, extrahid=extrahid, sy_nenvs=self.sy_nenvs, sy_nsteps=self.sy_nsteps - 1, pdparamsize=pdparamsize, rec_gate_init=rec_gate_init ) self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \ self.apply_multi_head_policy(self.ph_ob[None], ph_new=self.ph_new, ph_istate=ph_istate, reuse=True, scope=scope, hidsize=hidsize, memsize=memsize, extrahid=extrahid, sy_nenvs=self.sy_nenvs, sy_nsteps=self.sy_nsteps, pdparamsize=pdparamsize, rec_gate_init=rec_gate_init ) if dynamics_bonus: self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement) else: #self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement) self.aux_loss, self.int_rew, self.feat_var, self.max_feat = self.define_multi_head_self_prediction_rew( convfeat=convfeat, rep_size=rep_size, enlargement=enlargement) self.stage_rnd = tf.constant(1.) self.stage_prob = tf.constant(1.) 
if div_type == 'cls': with tf.variable_scope("div", reuse=False): #self.define_rew_discriminator(convfeat=convfeat, rep_size=256) with tf.variable_scope("int", reuse=False): self.disc_logits, self.all_div_prob, self.sp_prob, self.div_rew, self.disc_pd, self.disc_nlp = self.define_rew_discriminator_v2( convfeat=convfeat, rep_size=512, use_rew=True) else: self.div_rew = tf.constant(0.) pd = self.pdtype.pdfromflat(self.pdparam_rollout) self.a_samp = pd.sample() self.nlp_samp = pd.neglogp(self.a_samp) self.entropy_rollout = pd.entropy() self.pd_rollout = pd self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt) self.ph_istate = ph_istate @staticmethod def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init): data_format = 'NHWC' ph = ph_ob assert len(ph.shape.as_list()) == 5 # B,T,H,W,C logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) X = tf.cast(ph, tf.float32) / 255. X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:])) activ = tf.nn.relu yes_gpu = any(get_available_gpus()) with tf.variable_scope( scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'): X = activ( conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format)) X = activ( conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format)) X = activ( conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format)) X = to2d(X) X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2))) X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize]) X, snext = tf.nn.dynamic_rnn(GRUCell(memsize, rec_gate_init=rec_gate_init), (X, ph_new[:, :, None]), dtype=tf.float32, time_major=False, initial_state=ph_istate) X = tf.reshape(X, (-1, memsize)) Xtout = X if extrahid: Xtout = X + activ( fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1)) X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1)) pdparam = fc(X, 'pd', nh=pdparamsize, 
init_scale=0.01) vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01) vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01) pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize)) vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps)) vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps)) return pdparam, vpred_int, vpred_ext, snext def _build_policy_net(self, X, ph_new, ph_istate, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init): activ = tf.nn.relu data_format = 'NHWC' with tf.variable_scope(scope, reuse=reuse): X = activ( conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format)) X = activ( conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format)) X = activ( conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format)) X = to2d(X) X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2))) X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize]) X, snext = tf.nn.dynamic_rnn(GRUCell(memsize, rec_gate_init=rec_gate_init), (X, ph_new[:, :, None]), dtype=tf.float32, time_major=False, initial_state=ph_istate) X = tf.reshape(X, (-1, memsize)) Xtout = X if extrahid: Xtout = X + activ( fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1)) X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1)) pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01) vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01) vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01) return pdparam, vpred_int, vpred_ext, snext def apply_multi_head_policy(self, ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init): data_format = 'NHWC' ph = ph_ob assert len(ph.shape.as_list()) == 5 # B,T,H,W,C logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) X = tf.cast(ph, tf.float32) / 255. 
X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:])) yes_gpu = any(get_available_gpus()) with tf.variable_scope( scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'): all_pdparam = [] all_vint = [] all_vext = [] all_snext = [] for i in range(self.num_agents): scope = 'agent_{}'.format(str(i)) pdparam, vpred_int, vpred_ext, snext = self._build_policy_net( X=X, ph_new=ph_new, ph_istate=ph_istate, scope=scope, reuse=False, hidsize=hidsize, memsize=memsize, extrahid=extrahid, sy_nenvs=sy_nenvs, sy_nsteps=sy_nsteps, pdparamsize=pdparamsize, rec_gate_init=rec_gate_init) if i == 0: #[batch,naction] - > [batch, 1, naction] all_pdparam = tf.expand_dims(pdparam, axis=1) #[batch,1] -> [batch,1,1] all_vint = tf.expand_dims(vpred_int, axis=1) all_vext = tf.expand_dims(vpred_ext, axis=1) all_snext = tf.expand_dims(snext, axis=1) else: all_pdparam = tf.concat( [all_pdparam, tf.expand_dims(pdparam, axis=1)], axis=1) all_vint = tf.concat( [all_vint, tf.expand_dims(vpred_int, axis=1)], axis=1) all_vext = tf.concat( [all_vext, tf.expand_dims(vpred_ext, axis=1)], axis=1) all_snext = tf.concat( [all_snext, tf.expand_dims(snext, axis=1)], axis=1) #[batch, nstep] -> [batch,nstep, ngroups] one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1) #[batch,nstep, ngroups] -> [batch * nstep, ngroups,1] one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents, 1)) pdparam = tf.reduce_sum(one_hot_gidx * all_pdparam, axis=1) vpred_int = tf.reduce_sum(one_hot_gidx * all_vint, axis=1) vpred_ext = tf.reduce_sum(one_hot_gidx * all_vext, axis=1) snext = tf.reduce_sum(one_hot_gidx * all_snext, axis=1) pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize)) vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps)) vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps)) snext = tf.reshape(snext, (sy_nenvs, memsize)) return pdparam, vpred_int, vpred_ext, snext def _build_target_net(self, target_x, scope, reuse, convfeat, rep_size, enlargement): with 
tf.variable_scope(scope, reuse=reuse): xr = tf.nn.leaky_relu( conv(target_x, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2))) rgbr = [to2d(xr)] X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2)) return X_r def _build_pred_net(self, pred_x, scope, reuse, convfeat, rep_size, enlargement): with tf.variable_scope(scope, reuse=reuse): xrp = tf.nn.leaky_relu( conv(pred_x, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2))) rgbrp = to2d(xrp) # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = tf.nn.relu( fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = tf.nn.relu( fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2)) return X_r_hat def define_multi_head_self_prediction_rew(self, convfeat, rep_size, enlargement): logger.info( "Using multi-head RND BONUS ****************************************************" ) #RND bonus. # Random target network. 
for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) xr = ph[:, 1:] xr = tf.cast(xr, tf.float32) xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] ph_mean = tf.reshape( self.sep_ph_mean, (-1, *self.sep_ph_mean.shape.as_list()[-3:])) ph_std = tf.reshape( self.sep_ph_std, (-1, *self.sep_ph_std.shape.as_list()[-3:])) target_x = xr = tf.clip_by_value((xr - ph_mean) / ph_std, -5.0, 5.0) all_target_out = [] #target_out = self._build_target_net(target_x, 'target_net', False, convfeat, rep_size, enlargement) for i in range(self.num_agents): scope = 'target_net_{}'.format(str(i)) target_out = self._build_target_net( target_x, scope, tf.AUTO_REUSE, convfeat, rep_size, enlargement) if i == 0: #[env*step, rep_size] -> [env*step, 1, rep_size] all_target_out = tf.expand_dims(target_out, axis=1) else: #[env*step, 1, rep_size] -> [env*step, num_agents , rep_size] all_target_out = tf.concat([ all_target_out, tf.expand_dims(target_out, axis=1) ], axis=1) # Predictor network. 
for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) xrp = ph[:, 1:] xrp = tf.cast(xrp, tf.float32) xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] ph_mean = tf.reshape( self.sep_ph_mean, (-1, *self.sep_ph_mean.shape.as_list()[-3:])) ph_std = tf.reshape( self.sep_ph_std, (-1, *self.sep_ph_std.shape.as_list()[-3:])) pred_x = xrp = tf.clip_by_value((xrp - ph_mean) / ph_std, -5.0, 5.0) all_pred_out = [] for i in range(self.num_agents): scope = 'pred_net_{}'.format(str(i)) pred_out = self._build_pred_net(pred_x, scope, tf.AUTO_REUSE, convfeat, rep_size, enlargement) if i == 0: #[env*step, rep_size] -> [env*step, 1, rep_size] all_pred_out = tf.expand_dims(pred_out, axis=1) else: #[env*step, 1, rep_size] -> [env*step, num_agents , rep_size] all_pred_out = tf.concat( [all_pred_out, tf.expand_dims(pred_out, axis=1)], axis=1) #[env*step, num_agents , rep_size] -> [env*step, num_agents , 1] all_loss = tf.reduce_mean( tf.square(tf.stop_gradient(all_target_out) - all_pred_out), axis=-1, keep_dims=True) #[batch, nstep] -> [batch,nstep, ngroups] one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1) #[batch,nstep, ngroups] -> [batch * nstep, ngroups,1] one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents, 1)) X_r = tf.reduce_sum(one_hot_gidx * all_target_out, axis=1) feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1]) max_feat = tf.reduce_max(tf.abs(X_r)) #[env*step, num_agents , 1] -> [env*step, 1] int_rew = tf.reduce_sum(one_hot_gidx * all_loss, axis=1) int_rew = tf.reshape(int_rew, (self.sy_nenvs, self.sy_nsteps - 1)) #[env*step, num_agents ,1] rnd_mask = tf.reshape(self.rnd_mask, (-1, self.num_agents, 1)) rnd_mask = tf.cast(rnd_mask, tf.float32) #[env*step, num_agents , 1] -> [env*step] mask_loss = tf.reduce_sum(rnd_mask * all_loss, axis=[1, 2]) / tf.maximum( tf.reduce_sum(rnd_mask, axis=[1, 2]), 1.) 
aux_loss = mask_loss mask = tf.random_uniform(shape=tf.shape(aux_loss), minval=0., maxval=1., dtype=tf.float32) mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32) aux_loss = tf.reduce_sum(mask * aux_loss) / tf.maximum( tf.reduce_sum(mask), 1.) return aux_loss, int_rew, feat_var, max_feat def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False): output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)] sample_prob = tf.reshape(self.sample_agent_prob, tf.stack(output_shape)) game_score = tf.reshape( self.game_score, tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1])) rew_agent_label = tf.reshape( self.rew_agent_label, tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1])) #rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1) #rew_agent_label = tf.reshape(rew_agent_label,(-1,self.num_agents )) for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C phi = ph[:, 1:] phi = tf.cast(phi, tf.float32) phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] phi = phi / 255. last_rew_ob = self.last_rew_ob last_rew_ob = tf.cast(last_rew_ob, tf.float32) last_rew_ob = tf.reshape( last_rew_ob, (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:] last_rew_ob = last_rew_ob / 255. 
if use_rew: phi = tf.concat([phi, last_rew_ob], axis=-1) phi = tf.nn.leaky_relu( conv(phi, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2))) #[20,20] [8,8] phi = tf.nn.leaky_relu( conv(phi, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2))) #[9,9] [7,7] phi = tf.nn.leaky_relu( conv(phi, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2))) phi = to2d(phi) phi = tf.nn.relu( fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2))) phi = tf.nn.relu( fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2))) disc_logits = fc(phi, 'fc3r', nh=self.num_agents, init_scale=np.sqrt(2)) one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1) one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents)) flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1) all_div_prob = tf.reshape( flatten_all_div_prob, (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents)) sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1) sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1)) div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2( logits=disc_logits, labels=one_hot_gidx) base_rew = tf.log(0.01) div_rew = div_rew - tf.log(sample_prob) div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1)) disc_pdtype = CategoricalPdType(self.num_agents) disc_pd = disc_pdtype.pdfromflat(disc_logits) disc_nlp = disc_pd.neglogp(rew_agent_label) return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp def define_self_prediction_rew(self, convfeat, rep_size, enlargement): #RND. # Random target network. 
for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) xr = ph[:, 1:] xr = tf.cast(xr, tf.float32) xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0) xr = tf.nn.leaky_relu( conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2))) rgbr = [to2d(xr)] X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2)) # Predictor network. for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) xrp = ph[:, 1:] xrp = tf.cast(xrp, tf.float32) xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0) xrp = tf.nn.leaky_relu( conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2))) rgbrp = to2d(xrp) X_r_hat = tf.nn.relu( fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = tf.nn.relu( fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2)) self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1]) self.max_feat = tf.reduce_max(tf.abs(X_r)) self.int_rew = tf.reduce_mean( tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True) self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1)) noisy_targets = 
tf.stop_gradient(X_r) self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1) mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32) mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32) self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum( tf.reduce_sum(mask), 1.) def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement): #Dynamics based bonus. # Random target network. for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) xr = ph[:, 1:] xr = tf.cast(xr, tf.float32) xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0) xr = tf.nn.leaky_relu( conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2))) xr = tf.nn.leaky_relu( conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2))) rgbr = [to2d(xr)] X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2)) # Predictor network. ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2) assert ac_one_hot.get_shape().ndims == 3 assert ac_one_hot.get_shape().as_list() == [ None, None, self.ac_space.n ], ac_one_hot.get_shape().as_list() ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n)) def cond(x): return tf.concat([x, ac_one_hot], 1) for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape))) xrp = ph[:, :-1] xrp = tf.cast(xrp, tf.float32) xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:])) # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok? 
xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0) xrp = tf.nn.leaky_relu( conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2))) xrp = tf.nn.leaky_relu( conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2))) rgbrp = to2d(xrp) # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = tf.nn.relu( fc(cond(rgbrp), 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = tf.nn.relu( fc(cond(X_r_hat), 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2))) X_r_hat = fc(cond(X_r_hat), 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2)) self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1]) self.max_feat = tf.reduce_max(tf.abs(X_r)) self.int_rew = tf.reduce_mean( tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True) self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1)) noisy_targets = tf.stop_gradient(X_r) self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1) mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32) mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32) self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum( tf.reduce_sum(mask), 1.) def initial_state(self, n): return np.zeros((n, self.memsize), np.float32) def call(self, dict_obs, new, istate, agent_idx, update_obs_stats=False): for ob in dict_obs.values(): if ob is not None: if update_obs_stats: raise NotImplementedError ob = ob.astype(np.float32) ob = ob.reshape(-1, *self.ob_space.shape) self.ob_rms.update(ob) # Note: if it fails here with ph vs observations inconsistency, check if you're loading agent from disk. # It will use whatever observation spaces saved to disk along with other ctor params. 
feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys} feed2 = { self.ph_istate: istate, self.ph_new: new[:, None].astype(np.float32) } #feed1.update({self.ph_mean: self.ob_rms.mean, self.ph_std: self.ob_rms.var ** 0.5}) feed1.update({self.ph_agent_idx: agent_idx}) # for f in feed1: # print(f) a, vpred_int, vpred_ext, nlp, newstate, ent = tf_util.get_session( ).run([ self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout, self.nlp_samp, self.snext_rollout, self.entropy_rollout ], feed_dict={ **feed1, **feed2 }) base_vpred_ext = np.ones_like(vpred_ext) return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0], base_vpred_ext[:, 0] def get_ph_mean_std(self): mean, std = tf.get_default_session().run( [self.var_ph_mean, self.var_ph_std]) return mean, std
class CnnGruPolicy(StochasticPolicy):
    """Recurrent (CNN + GRU) policy with a Curiosity-Bottleneck bonus.

    Two copies of the policy graph share weights: the `*_opt` tensors are
    built on all but the last time step (training), the `*_rollout` tensors
    on the full sequence (acting).
    """

    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 exploration_type='bottleneck',
                 beta=1e-3,
                 rew_counter=None):
        """Create placeholders and build the policy and bonus graphs.

        Args:
            scope: Variable scope name for the policy network.
            ob_space: Observation space; its first two shape entries size
                the normalisation placeholders.
            ac_space: Action space.
            policy_size: One of 'small', 'normal', 'large'; scales the
                layer widths by 1, 2 or 4.
            maxpool: Unused in this class.
            extrahid: Add the extra residual hidden layers in the heads.
            hidsize: Base width of the first fully connected layer.
            memsize: Base size of the GRU state.
            rec_gate_init: Passed through to `GRUCell`.
            update_ob_stats_independently_per_gpu: When false, observation
                statistics are created with `use_mpi=True`.
            proportion_of_exp_used_for_predictor_update: Probability that a
                sample contributes to the auxiliary loss.
            exploration_type: Unused in this class.
            beta: Weight of the KL term in the auxiliary loss.
            rew_counter: Optional counter used to ramp up the value term of
                the auxiliary loss.
        """
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # For training
        (self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt,
         self.snext_opt) = self.apply_policy(self.ph_ob[None][:, :-1],
                                             ph_new=self.ph_new,
                                             ph_istate=ph_istate,
                                             reuse=False,
                                             scope=scope,
                                             hidsize=hidsize,
                                             memsize=memsize,
                                             extrahid=extrahid,
                                             sy_nenvs=self.sy_nenvs,
                                             sy_nsteps=self.sy_nsteps - 1,
                                             pdparamsize=pdparamsize,
                                             rec_gate_init=rec_gate_init)
        # For inference
        (self.pdparam_rollout, self.vpred_int_rollout,
         self.vpred_ext_rollout,
         self.snext_rollout) = self.apply_policy(self.ph_ob[None],
                                                 ph_new=self.ph_new,
                                                 ph_istate=ph_istate,
                                                 reuse=True,
                                                 scope=scope,
                                                 hidsize=hidsize,
                                                 memsize=memsize,
                                                 extrahid=extrahid,
                                                 sy_nenvs=self.sy_nenvs,
                                                 sy_nsteps=self.sy_nsteps,
                                                 pdparamsize=pdparamsize,
                                                 rec_gate_init=rec_gate_init)

        # Floor division keeps the layer width an int (512 // 8 == 64);
        # true division would hand a float to the layer constructors.
        self.define_bottleneck_rew(convfeat=convfeat,
                                   rep_size=rep_size // 8,
                                   enlargement=enlargement,
                                   beta=beta,
                                   rew_counter=rew_counter)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                     extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        """Build the CNN + GRU policy/value network on `ph_ob`.

        Args:
            ph_ob: 5-D image tensor (B, T, H, W, C); divided by 255.
            ph_new: Episode-start flags, fed to the GRU alongside features.
            ph_istate: Initial GRU state.
            reuse: Passed to `tf.variable_scope`.
            scope: Variable scope name.
            hidsize: Width of the fully connected layer before the GRU.
            memsize: GRU state size.
            extrahid: Add residual hidden layers before the heads.
            sy_nenvs: Batch (environment) dimension for reshapes.
            sy_nsteps: Time dimension for reshapes.
            pdparamsize: Number of distribution parameters.
            rec_gate_init: Passed through to `GRUCell`.

        Returns:
            `(pdparam, vpred_int, vpred_ext, snext)` with shapes
            (nenvs, nsteps, pdparamsize), (nenvs, nsteps), (nenvs, nsteps)
            and the final GRU state.
        """
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            # Restore the (env, time) layout expected by dynamic_rnn.
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext

    def define_bottleneck_rew(self,
                              convfeat,
                              rep_size,
                              enlargement,
                              beta=1e-2,
                              rew_counter=None):
        """Build the Curiosity-Bottleneck encoder, bonus and auxiliary loss.

        The encoder maps the normalised last channel of observations 1..T
        to a Gaussian (`mu`, `sigma`), samples `z` and predicts a value from
        it. The per-sample KL term is the intrinsic reward; the auxiliary
        loss is the (scheduled) squared value error plus `beta` times the KL.

        Sets `self.feat_var`, `self.max_feat`, `self.kl`, `self.int_rew`
        (shape (nenvs, nsteps - 1)) and `self.aux_loss`.

        Args:
            convfeat: Base number of convolution filters.
            rep_size: Dimension of the latent `z`.
            enlargement: Unused in this method.
            beta: Weight of the KL term.
            rew_counter: If given, the value-error term is scaled by
                min(rew_counter / 1000, 1).
        """
        logger.info(
            "Using Curiosity Bottleneck ****************************************************"
        )
        v_target = tf.reshape(self.ph_ret_ext, (-1, 1))

        if rew_counter is None:
            sched_coef = 1.
        else:
            sched_coef = tf.minimum(rew_counter / 1000, 1.)

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                mu = fc(rgbr[0], 'fc_mu', nh=rep_size, init_scale=np.sqrt(2))
                sigma = tf.nn.softplus(
                    fc(rgbr[0],
                       'fc_sigma',
                       nh=rep_size,
                       init_scale=np.sqrt(2)))
                # Reparameterised sample from N(mu, sigma^2).
                z = mu + sigma * tf.random_normal(
                    tf.shape(mu), 0, 1, dtype=tf.float32)
                v = fc(z, 'value', nh=1, init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(sigma)
        self.max_feat = tf.reduce_max(tf.abs(z))

        # KL(N(mu, sigma^2) || N(0, 1)); the 1e-8 keeps the log finite.
        self.kl = 0.5 * tf.reduce_sum(tf.square(mu) + tf.square(sigma) -
                                      tf.log(1e-8 + tf.square(sigma)) - 1,
                                      axis=-1,
                                      keepdims=True)
        self.int_rew = tf.stop_gradient(self.kl)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        self.aux_loss = sched_coef * tf.square(v_target - v) + beta * self.kl
        # Train on a random subset of the samples only.
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def initial_state(self, n):
        """Return an all-zero float32 state array of shape (n, memsize)."""
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """Run one rollout step of the policy.

        Args:
            dict_obs: Mapping from observation key to a batch of
                observations; a time axis of length 1 is inserted before
                feeding.
            new: Per-environment "episode start" flags.
            istate: State fed to `self.ph_istate`.
            update_obs_stats: Not supported; see Raises.

        Returns:
            Tuple `(action, vpred_int, vpred_ext, neglogp, new_state,
            entropy)`, each with the time axis removed (except `new_state`).

        Raises:
            NotImplementedError: If `update_obs_stats` is true and any
                observation in `dict_obs` is not None.
        """
        # Updating observation statistics from here was never implemented;
        # the statements that used to follow the raise were unreachable.
        if update_obs_stats and any(ob is not None
                                    for ob in dict_obs.values()):
            raise NotImplementedError

        # Note: if it fails here with ph vs observations inconsistency, check
        # if you're loading agent from disk. It will use whatever observation
        # spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })

        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session(
        ).run([
            self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
            self.nlp_samp, self.snext_rollout, self.entropy_rollout
        ],
              feed_dict={
                  **feed1,
                  **feed2
              })
        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0],
                newstate, ent[:, 0])
class CnnPolicy(StochasticPolicy):
    """Feed-forward CNN policy with RND/dynamics bonus and action balance.

    When `action_balance_coef` is not None, a per-action prediction-error
    bonus is computed and added (without gradient) to the policy logits.
    """

    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 action_balance_coef=1.,
                 array_action=True):
        """Create placeholders and build the policy and bonus graphs.

        Args:
            scope: Variable scope name for the policy network.
            ob_space: Observation space; its first two shape entries size
                the normalisation placeholders.
            ac_space: Action space.
            policy_size: One of 'small', 'normal', 'large'; scales the
                layer widths by 1, 2 or 4.
            maxpool: Unused in this class.
            extrahid: Add the extra residual hidden layers in the heads.
            hidsize: Base width of the first fully connected layer.
            memsize: Base size of the (dummy) recurrent state.
            rec_gate_init: Unused in this class.
            update_ob_stats_independently_per_gpu: When false, observation
                statistics are created with `use_mpi=True`.
            proportion_of_exp_used_for_predictor_update: Probability that a
                sample contributes to an auxiliary loss.
            dynamics_bonus: Use the dynamics-prediction bonus instead of
                the self-prediction (RND) bonus.
            action_balance_coef: Scale of the action-balance term added to
                the logits; None disables action balance.
            array_action: Encode actions as an extra image channel for the
                action-balance target network.
        """
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        self.action_balance_coef = action_balance_coef
        self.array_action = array_action

        self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        self.rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= self.enlargement
        hidsize *= self.enlargement
        self.convfeat = 16 * self.enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        if self.action_balance_coef is not None:
            # The action-balance graph must exist before apply_policy runs,
            # because apply_policy reads self.int_rew_ab_rollout.
            self.action_one_hot_rollout = get_action_one_hot(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps)

            if self.array_action:
                self.action_encode_array_rollout = get_action_encode_array(
                    self.ac_space.n, self.sy_nenvs, self.sy_nsteps,
                    ob_space.shape[:2])

            (self.feat_var_ab, self.max_feat_ab, self.int_rew_ab,
             self.int_rew_ab_rollout,
             self.aux_loss_ab) = self.define_action_balance_rew(
                 ph_ob=self.ph_ob[None],
                 action_one_hot=self.action_one_hot_rollout,
                 convfeat=self.convfeat,
                 rep_size=self.rep_size,
                 enlargement=self.enlargement,
                 sy_nenvs=self.sy_nenvs,
                 sy_nsteps=self.sy_nsteps,
             )

            self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

        # Inputs to policy and value function will have different shapes
        # depending on whether it is rollout or optimization time, so we
        # treat separately.
        (self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt,
         self.snext_opt,
         self.logits_raw_opt) = self.apply_policy(self.ph_ob[None][:, :-1],
                                                  reuse=False,
                                                  scope=scope,
                                                  hidsize=hidsize,
                                                  memsize=memsize,
                                                  extrahid=extrahid,
                                                  sy_nenvs=self.sy_nenvs,
                                                  sy_nsteps=self.sy_nsteps - 1,
                                                  pdparamsize=pdparamsize)
        (self.pdparam_rollout, self.vpred_int_rollout,
         self.vpred_ext_rollout, self.snext_rollout,
         _) = self.apply_policy(self.ph_ob[None],
                                reuse=True,
                                scope=scope,
                                hidsize=hidsize,
                                memsize=memsize,
                                extrahid=extrahid,
                                sy_nenvs=self.sy_nenvs,
                                sy_nsteps=self.sy_nsteps,
                                pdparamsize=pdparamsize)

        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=self.convfeat,
                                                rep_size=self.rep_size,
                                                enlargement=self.enlargement)
        else:
            self.define_self_prediction_rew(convfeat=self.convfeat,
                                            rep_size=self.rep_size,
                                            enlargement=self.enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    def apply_policy(
        self,
        ph_ob,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
    ):
        """Build the feed-forward policy/value network on `ph_ob`.

        Args:
            ph_ob: 5-D image tensor (B, T, H, W, C); divided by 255.
            reuse: Passed to `tf.variable_scope`.
            scope: Variable scope name.
            hidsize: Width of the first fully connected layer.
            memsize: Size of the zero "next state" that is returned.
            extrahid: Add residual hidden layers before the heads.
            sy_nenvs: Batch (environment) dimension for reshapes.
            sy_nsteps: Time dimension for reshapes.
            pdparamsize: Number of distribution parameters.

        Returns:
            `(pdparam, vpred_int, vpred_ext, snext, logits_raw)`.
            `logits_raw` is the policy output before the action-balance
            term is added; `pdparam` includes it when
            `self.action_balance_coef` is not None.
        """
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(
                fc(X,
                   'fc_additional',
                   nh=additional_size,
                   init_scale=np.sqrt(2)))
            # No recurrence here: the "next state" is a constant zero tensor.
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            logits_raw = pdparam
            if self.action_balance_coef is not None:
                # Bias the logits with the per-action bonus; no gradient
                # flows into the bonus networks through the policy loss.
                pdparam = pdparam + tf.stop_gradient(
                    self.int_rew_ab_rollout[:, :sy_nsteps] *
                    self.action_balance_coef)

            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext, logits_raw

    def define_action_balance_rew(self,
                                  ph_ob,
                                  action_one_hot,
                                  convfeat,
                                  rep_size,
                                  enlargement,
                                  sy_nenvs,
                                  sy_nsteps,
                                  l2_normalize=True,
                                  sd_normalize=False):
        """Build the per-action prediction-error bonus (action balance).

        A target network and a predictor network both embed every
        (observation, action) pair; the mean squared difference between
        their outputs is the bonus for that action.

        Args:
            ph_ob: 5-D image tensor (B, T, H, W, C).
            action_one_hot: One-hot action rows, concatenated to the inputs
                of the fully connected layers.
            convfeat: Base number of convolution filters.
            rep_size: Size of the feature vectors being compared.
            enlargement: Multiplier for the predictor's hidden layer width.
            sy_nenvs: Batch (environment) dimension for reshapes.
            sy_nsteps: Time dimension for reshapes.
            l2_normalize: L2-normalise the bonus over the action axis.
            sd_normalize: Standardise the bonus over the action axis (only
                used when `l2_normalize` is false).

        Returns:
            `(feat_var_ab, max_feat_ab, int_rew_ab, int_rew_ab_rollout,
            aux_loss_ab)`. `int_rew_ab_rollout` covers all time steps; the
            other tensors are computed with the last time step dropped.
        """
        logger.info(
            "Using Action Balance BONUS ****************************************************"
        )

        with tf.variable_scope('action_balance', reuse=tf.AUTO_REUSE):
            # Random target network.
            ph = ph_ob
            assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
            xr = ph
            xr = tf.cast(xr, tf.float32)
            xr = tf.reshape(xr,
                            (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                  5.0)

            def conv_layers(xr):
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                return xr

            if self.array_action:
                # Repeat every frame once per action and append the action
                # encoding as an extra channel.
                xr = tf.reshape(tf.tile(xr, [1, self.ac_space.n, 1, 1]),
                                (-1, *xr.shape[1:]))
                xr = tf.concat(
                    [xr, self.action_encode_array_rollout[..., None]],
                    axis=-1)
                xr = conv_layers(xr)
            else:
                xr = conv_layers(xr)
            rgbr = to2d(xr)

            if not self.array_action:
                # extend action dim
                rgbr_shape = rgbr.shape.as_list()
                rgbr = tf.reshape(tf.tile(rgbr, [1, self.ac_space.n]),
                                  (-1, rgbr_shape[1]))

            X_r = tf.nn.relu(
                fc(tf.concat([rgbr, action_one_hot], 1),
                   'fc1r',
                   nh=256,
                   init_scale=np.sqrt(2)))
            X_r = fc(tf.concat([X_r, action_one_hot], 1),
                     'fc2r',
                     nh=rep_size,
                     init_scale=np.sqrt(2))

            # Predictor network.
            logger.info("CnnTarget: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
            xrp = ph
            xrp = tf.cast(xrp, tf.float32)
            xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
            # ph_mean, ph_std are 84x84x1, so we subtract the average of the
            # last channel from all channels. Is this ok?
            xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0,
                                   5.0)

            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c1rp_pred',
                     nf=convfeat,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c2rp_pred',
                     nf=convfeat * 2,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c3rp_pred',
                     nf=convfeat * 2,
                     rf=3,
                     stride=1,
                     init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)

            # Repeat every feature row once per action.
            rgbrp_shape = rgbrp.shape.as_list()
            rgbrp = tf.reshape(tf.tile(rgbrp, [1, self.ac_space.n]),
                               (-1, rgbrp_shape[1]))

            X_r_hat = tf.nn.relu(
                fc(tf.concat([rgbrp, action_one_hot], 1),
                   'fc1r_hat1_pred',
                   nh=256 * enlargement,
                   init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(
                fc(tf.concat([X_r_hat, action_one_hot], 1),
                   'fc1r_hat2_pred',
                   nh=256 * enlargement,
                   init_scale=np.sqrt(2)))
            X_r_hat = fc(tf.concat([X_r_hat, action_one_hot], 1),
                         'fc1r_hat3_pred',
                         nh=rep_size,
                         init_scale=np.sqrt(2))

            X_r = tf.reshape(X_r,
                             (sy_nenvs, sy_nsteps, self.ac_space.n, rep_size))
            X_r_hat = tf.reshape(
                X_r_hat, (sy_nenvs, sy_nsteps, self.ac_space.n, rep_size))

            # Bonus over all time steps (used to bias the rollout logits).
            int_rew_ab_rollout = tf.reduce_mean(
                tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1)
            if l2_normalize:
                int_rew_ab_rollout = tf.math.l2_normalize(int_rew_ab_rollout,
                                                          axis=-1)
            elif sd_normalize:
                mean_tmp, var_tmp = tf.nn.moments(int_rew_ab_rollout,
                                                  axes=[-1],
                                                  keep_dims=True)
                int_rew_ab_rollout = (int_rew_ab_rollout -
                                      mean_tmp) / tf.math.sqrt(var_tmp)

            # Everything below drops the last time step.
            X_r = X_r[:, :-1]
            X_r_hat = X_r_hat[:, :-1]
            feat_var_ab = tf.reduce_mean(tf.nn.moments(X_r, axes=[0, 1])[1])
            max_feat_ab = tf.reduce_max(tf.abs(X_r))
            int_rew_ab = tf.reduce_mean(
                tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1)
            if l2_normalize:
                logger.info("Normalize logits:l2")
                int_rew_ab = tf.math.l2_normalize(int_rew_ab, axis=-1)
            elif sd_normalize:
                logger.info("Normalize logits:standard")
                mean_tmp, var_tmp = tf.nn.moments(int_rew_ab,
                                                  axes=[-1],
                                                  keep_dims=True)
                int_rew_ab = (int_rew_ab - mean_tmp) / tf.math.sqrt(var_tmp)

            noisy_targets = tf.stop_gradient(X_r)
            aux_loss_ab = tf.reduce_mean(tf.square(noisy_targets - X_r_hat),
                                         [-1])
            # Train the predictor on a random subset of the samples only.
            mask = tf.random_uniform(shape=tf.shape(aux_loss_ab),
                                     minval=0.,
                                     maxval=1.,
                                     dtype=tf.float32)
            mask = tf.cast(
                mask < self.proportion_of_exp_used_for_predictor_update,
                tf.float32)
            aux_loss_ab = tf.reduce_sum(mask * aux_loss_ab) / tf.maximum(
                tf.reduce_sum(mask), 1.)

        return (feat_var_ab, max_feat_ab, int_rew_ab, int_rew_ab_rollout,
                aux_loss_ab)

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the RND target/predictor networks, bonus and auxiliary loss.

        Both networks see the last channel of every observation after the
        first time step, normalised with `self.ph_mean` / `self.ph_std` and
        clipped to [-5, 5].

        Sets `self.feat_var`, `self.max_feat`, `self.int_rew` (shape
        (nenvs, nsteps - 1)) and `self.aux_loss`.

        Args:
            convfeat: Base number of convolution filters.
            rep_size: Size of the target feature vector.
            enlargement: Multiplier for the predictor's hidden layer width.
        """
        logger.info(
            "Using RND BONUS ****************************************************"
        )

        # RND bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(
                    xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        # Intrinsic reward: per-sample mean squared prediction error.
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keepdims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        # Train the predictor on a random subset of the samples only.
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        """Build the dynamics bonus: predict next-step random features.

        The target network embeds the last channel of observations 1..T; the
        predictor sees observations 0..T-1 (all channels) and the one-hot
        action `self.ph_ac`, which is concatenated to the input of every
        fully connected layer.

        Sets `self.feat_var`, `self.max_feat`, `self.int_rew` (shape
        (nenvs, nsteps - 1)) and `self.aux_loss`.

        Args:
            convfeat: Base number of convolution filters.
            rep_size: Size of the target feature vector.
            enlargement: Multiplier for the predictor's hidden layer width.
        """
        # Dynamics loss with random features.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(
                    xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            # Condition a feature batch on the action taken.
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of
                # the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp),
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat),
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat),
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        # Intrinsic reward: per-sample mean squared prediction error.
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keepdims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        # Train the predictor on a random subset of the samples only.
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def initial_state(self, n):
        """Return an all-zero float32 state array of shape (n, memsize)."""
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """Run one rollout step of the policy.

        Args:
            dict_obs: Mapping from observation key to a batch of
                observations; a time axis of length 1 is inserted before
                feeding.
            new: Per-environment "episode start" flags.
            istate: State fed to `self.ph_istate`.
            update_obs_stats: Not supported; see Raises.

        Returns:
            Tuple `(action, vpred_int, vpred_ext, neglogp, new_state,
            entropy)`, each with the time axis removed (except `new_state`).

        Raises:
            NotImplementedError: If `update_obs_stats` is true and any
                observation in `dict_obs` is not None.
        """
        # Updating observation statistics from here was never implemented;
        # the statements that used to follow the raise were unreachable.
        if update_obs_stats and any(ob is not None
                                    for ob in dict_obs.values()):
            raise NotImplementedError

        # Note: if it fails here with ph vs observations inconsistency, check
        # if you're loading agent from disk. It will use whatever observation
        # spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })

        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session(
        ).run([
            self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
            self.nlp_samp, self.snext_rollout, self.entropy_rollout
        ],
              feed_dict={
                  **feed1,
                  **feed2
              })
        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0],
                newstate, ent[:, 0])