def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                 extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init):
    data_format = 'NHWC'
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32) / 255.
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))
    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
        X = to2d(X)
        X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
        X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
        X, snext = tf.nn.dynamic_rnn(
            GRUCell(memsize, rec_gate_init=rec_gate_init),
            (X, ph_new[:, :, None]),
            dtype=tf.float32, time_major=False, initial_state=ph_istate)
        X = tf.reshape(X, (-1, memsize))
        Xtout = X
        if extrahid:
            Xtout = X + activ(fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
            X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
        pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
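# NOTE: a minimal NumPy sketch (an assumption, not this repo's GRUCell) of the
# episode-reset masking implied by feeding (X, ph_new[:, :, None]) into the
# recurrent cell above: the hidden state is zeroed wherever ph_new == 1, so
# memory never leaks across episode boundaries. All weight names here are
# hypothetical.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def masked_gru_step(x, h, new, Wz, Uz, Wr, Ur, Wh, Uh):
    """One GRU step; `new` is 1.0 on the first step of an episode."""
    h = h * (1.0 - new)                      # forget state at episode start
    z = sigmoid(x @ Wz + h @ Uz)             # update gate
    r = sigmoid(x @ Wr + h @ Ur)             # reset gate
    h_cand = np.tanh(x @ Wh + (r * h) @ Uh)  # candidate state
    return (1.0 - z) * h + z * h_cand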
def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs,
                 sy_nsteps, pdparamsize):
    data_format = 'NHWC'
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info("CnnPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32) / 255.
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))
    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
        X = to2d(X)
        mix_other_observations = [X]
        X = tf.concat(mix_other_observations, axis=1)
        X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
        additional_size = 448
        X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))
        snext = tf.zeros((sy_nenvs, memsize))
        mix_timeout = [X]
        Xtout = tf.concat(mix_timeout, axis=1)
        if extrahid:
            Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
            X = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
        pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
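# NOTE: hedged sketch of how the two value heads above are typically consumed
# downstream; the coefficients and function name are assumptions, chosen to
# match the usual RND setup where intrinsic and extrinsic returns are
# estimated by separate critics and their advantages are mixed linearly.
import numpy as np

def mixed_advantage(adv_int, adv_ext, int_coeff=1.0, ext_coeff=2.0):
    """Combine per-step advantages from the intrinsic and extrinsic critics."""
    return int_coeff * np.asarray(adv_int) + ext_coeff * np.asarray(adv_ext)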
def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs,
                 sy_nsteps, pdparamsize, use_action_balance=None):
    ph = ph_ob
    assert len(ph.shape.as_list()) == 3  # B,T,S
    logger.info("MlpPolicy: using '%s' shape %s as observation input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32)
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-1:]))
    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X = activ(fc(X, 'fc_0', nh=hidsize, init_scale=np.sqrt(2)))
        mix_other_observations = [X]
        X = tf.concat(mix_other_observations, axis=1)
        X = activ(fc(X, 'fc_1', nh=hidsize, init_scale=np.sqrt(2)))
        additional_size = 64
        X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))
        snext = tf.zeros((sy_nenvs, memsize))
        mix_timeout = [X]
        Xtout = tf.concat(mix_timeout, axis=1)
        if extrahid:
            Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
            X = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
        pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)
        # if use_action_balance:
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
def __init__(self, orders, d, coeffs, name=None):
    self.orders = orders
    self.d = d
    self.coeffs = coeffs
    self.input_dim = self.orders.shape[1]
    self.name = name
    gpu_devices = U.get_available_gpus()
    if gpu_devices:
        device = gpu_devices[0]
    else:
        cpu_devices = U.get_available_cpus()
        device = cpu_devices[0]
    with tf.device(device):
        self.declare_vars()
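# NOTE: a sketch of what U.get_available_gpus / U.get_available_cpus are
# assumed to do, following the common device_lib enumeration pattern; the
# implementation here is illustrative, not necessarily this repo's.
from tensorflow.python.client import device_lib

def get_available_gpus():
    return [d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU']

def get_available_cpus():
    return [d.name for d in device_lib.list_local_devices() if d.device_type == 'CPU']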
def __init__(self, res=None, activation=None, keras=False, model=None, reuse=False):
    if not keras:
        # activation type
        activations = activation.split('_')
        if len(activations) > 1:
            self.activation = activations[0]
            self.last_layer_activation = activations[1]
        else:
            self.activation = activation
            self.last_layer_activation = None
        # affine mapping of the output
        self.offset = res[-2]
        self.scale_factor = res[-1]
        # parse the structure of the neural network
        self.num_of_inputs = int(res[0])
        self.num_of_outputs = int(res[1])
        self.num_of_hidden_layers = int(res[2])
        self.network_structure = np.zeros(self.num_of_hidden_layers + 1, dtype=int)
        self.activations = [self.activation] * (self.num_of_hidden_layers + 1)
        if self.last_layer_activation is not None:
            self.activations[-1] = self.last_layer_activation
        # pointer is the current reading index
        self.pointer = 3
        # number of neurons in each hidden layer
        for i in range(self.num_of_hidden_layers):
            self.network_structure[i] = int(res[self.pointer])
            self.pointer += 1
        # output layer
        self.network_structure[-1] = self.num_of_outputs
        # all values from the text file
        self.param = res
        # store the weights and biases in two lists:
        # self.weights, self.bias
        gpu_devices = U.get_available_gpus()
        if gpu_devices:
            device = gpu_devices[0]
        else:
            cpu_devices = U.get_available_cpus()
            device = cpu_devices[0]
        with tf.device(device):
            self.parse_w_b()
            self.x = tf.placeholder(tf.float64, shape=[None, self.num_of_inputs], name='input')
            self.y = self.tensorflow_representation(self.x, reuse=reuse)
    else:
        params = []
        self.weights = []
        self.bias = []
        for layer in model.layers:
            params.append(layer.get_weights())  # list of numpy arrays
        for param in params:
            if len(param) == 0:
                continue
            self.weights.append(param[0])
            self.bias.append(param[1])
        self.model = model
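# NOTE: hypothetical example of the flat `res` layout the constructor above
# assumes: a header [n_inputs, n_outputs, n_hidden_layers], one width per
# hidden layer, then the flattened parameters, with the output offset and
# scale factor as the final two entries (res[-2], res[-1]).
res = [
    2, 1, 2,    # 2 inputs, 1 output, 2 hidden layers
    16, 16,     # widths of the two hidden layers (read via self.pointer)
    # ... flattened weights and biases consumed by parse_w_b() ...
    0.0, 1.0,   # offset, scale_factor
]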
def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs,
                 sy_nsteps, pdparamsize, additional_inputs=None):
    meta_rl = False
    data_format = "NHWC"
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info(f"CnnPolicy: using '{ph.name}' shape {ph.shape} as image input")
    X = tf.cast(ph, tf.float32) / 255.0
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))
    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
        X = activ(conv(X, "c1", nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, "c2", nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, "c3", nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
        X = to2d(X)
        mix_other_observations = [X]
        # Guard against the default additional_inputs=None before the membership tests.
        if additional_inputs is not None and ('prev_acs' in additional_inputs) and ('prev_rew' in additional_inputs):
            # Cast numpy arrays to tf tensors
            prev_acs = tf.cast(additional_inputs['prev_acs'], tf.float32)
            prev_rew = tf.cast(additional_inputs['prev_rew'], tf.float32)
            # Flatten out the time dimension
            prev_acs = tf.reshape(prev_acs, (-1, *prev_acs.shape.as_list()[2:]))
            prev_rew = tf.reshape(prev_rew, (-1, *prev_rew.shape.as_list()[2:]))
            # Add to the 2D features going into the FC layers
            mix_other_observations.extend([prev_acs, prev_rew])
        X = tf.concat(mix_other_observations, axis=1)
        X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
        additional_size = 448
        X = activ(fc(X, "fc_additional", nh=additional_size, init_scale=np.sqrt(2)))
        snext = tf.zeros((sy_nenvs, memsize))
        mix_timeout = [X]
        Xtout = tf.concat(mix_timeout, axis=1)
        if extrahid:
            Xtout = X + activ(fc(Xtout, "fc2val", nh=additional_size, init_scale=0.1))
            X = X + activ(fc(X, "fc2act", nh=additional_size, init_scale=0.1))
        pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
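# NOTE: hypothetical construction of `additional_inputs` for the prev_acs /
# prev_rew branch above; the helper name and exact shapes are assumptions.
# Previous actions are one-hot encoded and previous rewards get a trailing
# feature axis, both shaped [nenvs, nsteps, ...] so the time dimension can
# be flattened alongside the image features.
import numpy as np

def make_additional_inputs(prev_actions, prev_rewards, n_actions):
    prev_acs = np.eye(n_actions, dtype=np.float32)[prev_actions]      # [nenvs, nsteps, n_actions]
    prev_rew = np.asarray(prev_rewards, dtype=np.float32)[..., None]  # [nenvs, nsteps, 1]
    return {'prev_acs': prev_acs, 'prev_rew': prev_rew}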
def apply_policy(self, ph_ob, ph_new, ph_istate, reuse, scope, hidsize,
                 memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                 rec_gate_init):
    data_format = 'NHWC'
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32) / 255.  # (None, 84, 84, 4) in case of MontezumaRevengeNoFrameskip
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))
    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
        # X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
        # X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))

        # Over 14k rewards with these two attention blocks and only the first conv layer:
        # with tf.variable_scope("augmented1"):
        #     X = self.augmented_conv2d(X, 256, dk=24, dv=24)
        # with tf.variable_scope("augmented2"):
        #     X = self.augmented_conv2d(X, 256, dk=24, dv=24)

        # 5.8k rewards / 3 levels with these two blocks and the first two conv layers:
        # with tf.variable_scope("augmented1"):
        #     X = self.augmented_conv2d(X, 512, dk=256, dv=256)
        # with tf.variable_scope("augmented2"):
        #     X = self.augmented_conv2d(X, 512, dk=256, dv=256)

        with tf.variable_scope("augmented1"):
            X = self.augmented_conv2d(X, 256, dk=24, dv=24)
        with tf.variable_scope("augmented2"):
            X = self.augmented_conv2d(X, 256, dk=24, dv=24)
        X = to2d(X)
        X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
        X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
        X, snext = tf.nn.dynamic_rnn(
            GRUCell(memsize, rec_gate_init=rec_gate_init),
            (X, ph_new[:, :, None]),
            dtype=tf.float32, time_major=False, initial_state=ph_istate)
        X = tf.reshape(X, (-1, memsize))
        Xtout = X
        if extrahid:
            Xtout = X + activ(fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
            X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
        pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                 extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init):
    ph = ph_ob
    logger.info(f"CnnGruPolicy: using '{ph.name}' shape {ph.shape} as input")
    assert len(ph.shape.as_list()) == 3  # B (envs), T (steps), features
    X = tf.cast(ph, tf.float32) / 255.0
    # Keep only the feature axis when flattening batch and time; the input is
    # rank 3, so slicing the last three dims would grab the whole shape.
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-1:]))
    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
        # The feature layers are named fc1a/fc2a/fc3a so their variable
        # scopes do not collide with the 'fc1' scope created below.
        X = activ(fc(X, "fc1a", nh=32, init_scale=np.sqrt(2)))
        X = activ(fc(X, "fc2a", nh=64, init_scale=np.sqrt(2)))
        X = activ(fc(X, "fc3a", nh=64, init_scale=np.sqrt(2)))
        X = to2d(X)
        X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
        X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
        X, snext = tf.nn.dynamic_rnn(
            GRUCell(memsize, rec_gate_init=rec_gate_init),
            (X, ph_new[:, :, None]),
            dtype=tf.float32, time_major=False, initial_state=ph_istate)
        X = tf.reshape(X, (-1, memsize))
        Xtout = X
        if extrahid:
            Xtout = X + activ(fc(Xtout, "fc2val", nh=memsize, init_scale=0.1))
            X = X + activ(fc(X, "fc2act", nh=memsize, init_scale=0.1))
        pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
def apply_multi_head_policy(self, ph_ob, ph_new, ph_istate, reuse, scope,
                            hidsize, memsize, extrahid, sy_nenvs, sy_nsteps,
                            pdparamsize, rec_gate_init):
    data_format = 'NHWC'
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32) / 255.
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        all_pdparam = []
        all_vint = []
        all_vext = []
        all_snext = []
        for i in range(self.num_agents):
            scope = 'agent_{}'.format(i)
            pdparam, vpred_int, vpred_ext, snext = self._build_policy_net(
                X=X, ph_new=ph_new, ph_istate=ph_istate, scope=scope,
                reuse=False, hidsize=hidsize, memsize=memsize,
                extrahid=extrahid, sy_nenvs=sy_nenvs, sy_nsteps=sy_nsteps,
                pdparamsize=pdparamsize, rec_gate_init=rec_gate_init)
            if i == 0:
                # [batch, naction] -> [batch, 1, naction]
                all_pdparam = tf.expand_dims(pdparam, axis=1)
                # [batch, 1] -> [batch, 1, 1]
                all_vint = tf.expand_dims(vpred_int, axis=1)
                all_vext = tf.expand_dims(vpred_ext, axis=1)
                all_snext = tf.expand_dims(snext, axis=1)
            else:
                all_pdparam = tf.concat([all_pdparam, tf.expand_dims(pdparam, axis=1)], axis=1)
                all_vint = tf.concat([all_vint, tf.expand_dims(vpred_int, axis=1)], axis=1)
                all_vext = tf.concat([all_vext, tf.expand_dims(vpred_ext, axis=1)], axis=1)
                all_snext = tf.concat([all_snext, tf.expand_dims(snext, axis=1)], axis=1)
        # [batch, nstep] -> [batch, nstep, ngroups]
        one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
        # [batch, nstep, ngroups] -> [batch * nstep, ngroups, 1]
        one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents, 1))
        pdparam = tf.reduce_sum(one_hot_gidx * all_pdparam, axis=1)
        vpred_int = tf.reduce_sum(one_hot_gidx * all_vint, axis=1)
        vpred_ext = tf.reduce_sum(one_hot_gidx * all_vext, axis=1)
        snext = tf.reduce_sum(one_hot_gidx * all_snext, axis=1)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        snext = tf.reshape(snext, (sy_nenvs, memsize))
    return pdparam, vpred_int, vpred_ext, snext
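# NOTE: a NumPy sketch of the one-hot selection trick used above: stacking
# per-agent outputs along axis 1 and contracting with a one-hot agent index
# selects one head per row, equivalent to fancy indexing. Shapes are
# illustrative.
import numpy as np

all_pdparam = np.random.randn(6, 3, 4)     # [batch, num_agents, pdparamsize]
agent_idx = np.array([0, 2, 1, 1, 0, 2])   # which head each row should use
one_hot = np.eye(3)[agent_idx][..., None]  # [batch, num_agents, 1]
selected = (one_hot * all_pdparam).sum(axis=1)
assert np.allclose(selected, all_pdparam[np.arange(6), agent_idx])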
def define_self_prediction_rew(self, convfeat, rep_size, enlargement, scope):
    # RND.
    # Random target network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xr = ph[:, 1:]
            xr = tf.cast(xr, tf.float32)
            xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)
            xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
            rgbr = [to2d(xr)]
            X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

            # Define random features of the expert agent's observations,
            # reusing the target-network variables created above.
            yes_gpu = any(get_available_gpus())
            with tf.variable_scope(tf.get_variable_scope(), reuse=True), \
                    tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
                X_im = np.load(os.getcwd() + '/policies/obs.npy')
                Xr_im = tf.cast(X_im, tf.float32) / 255.
                Xr_im = tf.reshape(Xr_im, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
                Xr_im = tf.clip_by_value(
                    (Xr_im - tf.reduce_mean(Xr_im)) / (tf.math.reduce_std(Xr_im) ** 0.5),
                    -5.0, 5.0)
                Xr_im = tf.nn.leaky_relu(conv(Xr_im, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
                Xr_im = tf.nn.leaky_relu(conv(Xr_im, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
                Xr_im = tf.nn.leaky_relu(conv(Xr_im, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
                Xr_im = [to2d(Xr_im)[::self.demonstration_stride]]
                Xr_im = fc(Xr_im[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))
                Xr_im = tf.stop_gradient(Xr_im)

    # Predictor network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C (batch, time, height, width, channels)
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xrp = ph[:, 1:]
            xrp = tf.cast(xrp, tf.float32)
            xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)
            xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)
            X_r_hat = tf.nn.relu(fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

    self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
    self.max_feat = tf.reduce_max(tf.abs(X_r))
    self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True)
    self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

    # Scale the novelty bonus by similarity to the expert demonstration features.
    # self.im_rew = tf.math.maximum(
    #     1 - tf.divide(
    #         tf.reduce_mean(tf.square(self.Xr_im[:X_r.shape[0]] - X_r), axis=-1, keep_dims=True),
    #         tf.add(tf.reduce_mean(tf.square(self.Xr_im[:X_r.shape[0]]), axis=-1, keep_dims=True),
    #                tf.reduce_mean(tf.square(X_r), axis=-1, keep_dims=True))),
    #     tf.constant(0.5))
    im_rew = tf.reduce_mean(tf.tensordot(tf.stop_gradient(X_r), Xr_im, axes=[[1], [1]]), axis=1)
    im_rew = tf.reshape(im_rew, (self.sy_nenvs, self.sy_nsteps - 1))
    # self.int_rew = tf.math.maximum(self.im_rew, self.int_rew)
    self.int_rew = self.int_rew * (1 + tf.math.tanh(im_rew / 100))

    noisy_targets = tf.stop_gradient(X_r)
    self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
    mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)
    mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
    self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)