import numpy as np
import tensorflow as tf

from baselines.a2c.utils import conv, fc, conv_to_fc
from baselines.common.distributions import CategoricalPdType


class CNNPolicy(object):
    def __init__(self, sess, p, train_phase=True, has_state=False):
        # reuse=train_phase: the acting model (train_phase=False) is built first and
        # creates the variables; the training model then reuses them.
        with tf.variable_scope("model", reuse=train_phase) as scope:
            # Placeholders
            X = tf.placeholder(tf.uint8, p.OBS_SHAPE)       # observations
            S = tf.placeholder(tf.float32, p.STATE_SHAPE)   # auxiliary state vector
            scaled_x = tf.cast(X, tf.float32) / 255.

            # Helper ops
            relu_activ = tf.nn.relu  # ReLU activation
            # Batch normalization (defined for optional use)
            normalize = lambda layer: tf.layers.batch_normalization(
                layer, center=True, scale=True, training=train_phase)

            # Model. A convolutional front-end is kept for reference:
            # h1 = relu_activ(conv(scaled_x, scope='conv1', nf=10, rf=5, stride=1, init_scale=np.sqrt(2)))
            # h2 = relu_activ(conv(h1, scope='conv2', nf=10, rf=3, stride=1))
            flattened_x = conv_to_fc(scaled_x)
            h1 = relu_activ(fc(flattened_x, scope='fc1', nh=20, init_scale=np.sqrt(2)))
            h2 = relu_activ(fc(h1, scope='fc2', nh=15, init_scale=np.sqrt(2)))
            hconcat = tf.concat([h2, S], axis=1)
            h3 = relu_activ(fc(hconcat, scope='fc3', nh=10, init_scale=np.sqrt(2)))
            hcommon = relu_activ(fc(h3, scope='fcommon', nh=10, init_scale=np.sqrt(2)))
            pi = fc(hcommon, scope="policy", nh=3, init_scale=0.01)  # policy logits
            vf = fc(hcommon, scope="value", nh=1)                    # state value

        # Action distribution, sampled as in baselines
        self.pd_type = CategoricalPdType(p.NUM_ACTIONS)
        self.pd = self.pd_type.pdfromflat(pi)

        v0 = vf[:, 0]                    # drop the trailing dimension
        a0 = self.pd.sample()            # sample an action from the distribution
        neglogp0 = self.pd.neglogp(a0)   # negative log-probability of the sampled action
        self.initial_state = None        # only needed for recurrent models

        # Interfaces to the outer world
        def step(ob, state, *_args, **_kwargs):
            # Run v0 (not vf) so the returned value has shape (batch,), consistent with value().
            a, v, neglogp = sess.run([a0, v0, neglogp0], {X: ob, S: state})
            return a, v, neglogp

        def value(ob, state, *_args, **_kwargs):
            return sess.run(v0, {X: ob, S: state})

        def hidden_value(ob, state, *_args, **_kwargs):
            """Created for debugging purposes."""
            # amodel = np.argmax(np.array(sess.run([pi], {X: ob, S: state})).flatten())
            # a = sess.run([a0], {X: ob, S: state})
            # adict = {"amodel": amodel, "asampler": a}
            return sess.run([hcommon], {X: ob, S: state})

        self.pi = pi
        self.vf = vf
        self.X = X
        self.S = S
        self.step = step
        self.value = value
        self.hidden_value = hidden_value  # kept for debugging
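# A minimal usage sketch for CNNPolicy. The hyperparameter container below is an
# assumption for illustration; only its attribute names (OBS_SHAPE, STATE_SHAPE,
# NUM_ACTIONS) are taken from what CNNPolicy actually reads. Note the acting model
# must be built before the training model so that reuse=train_phase resolves correctly.

class HParams(object):  # hypothetical stand-in for `p`
    OBS_SHAPE = [None, 84, 84, 1]   # assumed image observation shape
    STATE_SHAPE = [None, 4]         # assumed auxiliary state shape
    NUM_ACTIONS = 3                 # matches nh of the "policy" head above

def _cnn_policy_demo():
    sess = tf.Session()
    act_model = CNNPolicy(sess, HParams(), train_phase=False)    # creates variables
    train_model = CNNPolicy(sess, HParams(), train_phase=True)   # reuses the same variables
    sess.run(tf.global_variables_initializer())

    obs = np.zeros((1, 84, 84, 1), dtype=np.uint8)
    state = np.zeros((1, 4), dtype=np.float32)
    actions, values, neglogps = act_model.step(obs, state)
    return actions, values, neglogps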
# The act-builders below follow the structure of baselines.deepq.build_graph and
# assume its usual module header:
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.common.distributions import CategoricalPdType


def build_act(make_obs_ph, q_func, hr_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    hr_func: (tf.Variable, int, str) -> tf.Variable
        model that predicts human feedback for every action; same input
        convention as q_func.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_rl_importance_ph = tf.placeholder(tf.float32, (), name="update_rl_importance")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        # Epsilon-greedy action selection from the Q-network.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                           maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1,
                                         dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)
        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        # Actions sampled from the predicted-human-feedback model.
        predicted_feedback = hr_func(observations_ph.get(), num_actions, scope="hr_func")
        fb_logit_constant = 10  # sharpens the feedback logits before sampling
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        # Mix the two policies: with probability rl_importance take the RL action,
        # otherwise the feedback action.
        chose_rl = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1,
                                     dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0, lambda: update_rl_importance_ph,
                    lambda: rl_importance))

        _act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph, update_rl_importance_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0, update_rl_importance_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr, update_rl_importance_expr])

        def act(ob, stochastic=True, update_eps=-1, update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, update_rl_importance)

        return act
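# A sketch of how the returned act function might be driven from a training loop.
# The linear schedules and the function below are illustrative assumptions, not part
# of this file; only the update_eps / update_rl_importance knobs come from build_act.

def linear_schedule(t, start, end, horizon):
    """Linearly interpolate from start to end over `horizon` steps, then hold."""
    frac = min(float(t) / horizon, 1.0)
    return start + frac * (end - start)

def run_with_mixed_control(act, env, total_timesteps):
    """Decay epsilon while gradually shifting control from the predicted-human-feedback
    policy (rl_importance ~ 0) to the learned Q-policy (rl_importance ~ 1)."""
    import numpy as np
    obs = env.reset()
    for t in range(total_timesteps):
        eps_t = linear_schedule(t, start=1.0, end=0.02, horizon=100000)
        ri_t = linear_schedule(t, start=0.0, end=1.0, horizon=200000)
        action = act(np.array(obs)[None], update_eps=eps_t, update_rl_importance=ri_t)[0]
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()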
def build_act_with_param_noise(make_obs_ph, q_func, hr_func, num_actions,
                               scope="deepq", reuse=None, param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration
    (https://arxiv.org/abs/1706.01905):

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    hr_func: (tf.Variable, int, str) -> tf.Variable
        model that predicts human feedback for every action; same input
        convention as q_func.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    # scope_vars, absolute_scope_name and default_param_noise_filter are module-level
    # helpers in this file, as in baselines.deepq.build_graph.
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(
            tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(
            tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")
        update_rl_importance_ph = tf.placeholder(tf.float32, (), name="update_rl_importance")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable(
            "param_noise_scale", (), initializer=tf.constant_initializer(0.01),
            trainable=False)
        param_noise_threshold = tf.get_variable(
            "param_noise_threshold", (), initializer=tf.constant_initializer(0.05),
            trainable=False)
        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(), num_actions,
                                    scope="perturbed_q_func")

        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond
        # for a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(
                        perturbed_var,
                        var + tf.random_normal(shape=tf.shape(var), mean=0.,
                                               stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`.
        # This perturbs yet another copy of the network and measures the effect of that
        # perturbation in action space. If the perturbation is too big, reduce the scale
        # of the perturbation; otherwise increase it.
        q_values_adaptive = q_func(observations_ph.get(), num_actions,
                                   scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func",
                                            perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(
            tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) -
                                       tf.log(tf.nn.softmax(q_values_adaptive))),
            axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0,
                    lambda: update_param_noise_threshold_ph,
                    lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                           maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1,
                                         dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)
        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        # Actions sampled from the predicted-human-feedback model.
        predicted_feedback = hr_func(observations_ph.get(), num_actions, scope="hr_func")
        fb_logit_constant = 10  # sharpens the feedback logits before sampling
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        # Mix the two policies: with probability rl_importance take the RL action.
        chose_rl = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1,
                                     dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0, lambda: update_rl_importance_ph,
                    lambda: rl_importance))
        updates = [
            update_eps_expr,
            tf.cond(reset_ph,
                    lambda: perturb_vars(original_scope="q_func",
                                         perturbed_scope="perturbed_q_func"),
                    lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(),
                    lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
            update_rl_importance_expr,
        ]

        _act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph,
                    update_param_noise_threshold_ph, update_param_noise_scale_ph,
                    update_rl_importance_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0,
                    stochastic_ph: True,
                    reset_ph: False,
                    # -1.0 (not False, which feeds 0.0 and would zero the threshold
                    # every call via the `>= 0` guard) leaves the threshold unchanged.
                    update_param_noise_threshold_ph: -1.0,
                    update_param_noise_scale_ph: False,
                    update_rl_importance_ph: -1.0},
            updates=updates)

        def act(ob, reset=False, update_param_noise_threshold=-1,
                update_param_noise_scale=False, stochastic=True, update_eps=-1,
                update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold,
                        update_param_noise_scale, update_rl_importance)

        return act
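# To make the adaptation rule in update_scale() concrete, here is the same logic
# rendered in plain Python outside the TF graph (an illustrative sketch, not part of
# the original module): grow the perturbation while its measured effect in action
# space (mean KL between the clean and adaptively perturbed policies) stays below
# the threshold, and shrink it otherwise.

def adapt_param_noise_scale(scale, mean_kl, threshold, alpha=1.01):
    """Multiplicative update mirroring update_scale() above."""
    return scale * alpha if mean_kl < threshold else scale / alpha

def _param_noise_demo():
    scale = 0.01  # matches the param_noise_scale initializer
    for mean_kl in [0.001, 0.002, 0.08, 0.01]:  # made-up KL measurements
        scale = adapt_param_noise_scale(scale, mean_kl, threshold=0.05)
        print(scale)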
def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False):
    output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)]
    sample_prob = tf.reshape(self.sample_agent_prob, tf.stack(output_shape))
    game_score = tf.reshape(
        self.game_score, tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

    rew_agent_label = tf.reshape(
        self.rew_agent_label, tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))
    # rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1)
    # rew_agent_label = tf.reshape(rew_agent_label, (-1, self.num_agents))

    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            phi = ph[:, 1:]
            phi = tf.cast(phi, tf.float32)
            phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            phi = phi / 255.

            last_rew_ob = self.last_rew_ob
            last_rew_ob = tf.cast(last_rew_ob, tf.float32)
            last_rew_ob = tf.reshape(
                last_rew_ob, (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:]
            last_rew_ob = last_rew_ob / 255.

            if use_rew:
                phi = tf.concat([phi, last_rew_ob], axis=-1)

            phi = tf.nn.leaky_relu(
                conv(phi, 'c1r', nf=convfeat * 1, rf=8, stride=4,
                     init_scale=np.sqrt(2)))  # [20,20] [8,8]
            phi = tf.nn.leaky_relu(
                conv(phi, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2,
                     init_scale=np.sqrt(2)))  # [9,9] [7,7]
            phi = tf.nn.leaky_relu(
                conv(phi, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1,
                     init_scale=np.sqrt(2)))
            phi = to2d(phi)

            phi = tf.nn.relu(fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2)))
            phi = tf.nn.relu(fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2)))
            disc_logits = fc(phi, 'fc3r', nh=self.num_agents, init_scale=np.sqrt(2))

    one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
    one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents))

    flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1)
    all_div_prob = tf.reshape(
        flatten_all_div_prob, (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents))

    sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1)
    sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1))

    # Diversity reward: the log-probability the discriminator assigns to the acting
    # agent, importance-corrected by the agent-sampling probability.
    div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=disc_logits, labels=one_hot_gidx)
    base_rew = tf.log(0.01)  # reference baseline (currently unused)
    div_rew = div_rew - tf.log(sample_prob)
    div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1))

    disc_pdtype = CategoricalPdType(self.num_agents)
    disc_pd = disc_pdtype.pdfromflat(disc_logits)
    disc_nlp = disc_pd.neglogp(rew_agent_label)

    return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp
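# A numpy rendering of the div_rew computation above (an illustrative sketch, not part
# of the original graph). Since -softmax_cross_entropy(logits, one_hot(i)) equals
# log softmax(logits)[i], the reward is log p(agent | obs) - log(sample_prob).

import numpy as np

def diversity_reward(disc_logits, agent_idx, sample_prob):
    """div_rew for a flattened batch: log-probability the discriminator assigns
    to the acting agent, minus the log of the agent-sampling probability."""
    shifted = disc_logits - disc_logits.max(axis=-1, keepdims=True)  # stable softmax
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
    log_p_agent = np.log(probs[np.arange(len(agent_idx)), agent_idx])
    return log_p_agent - np.log(sample_prob)

def _diversity_reward_demo():
    # Two timesteps, three agents; all values are made up.
    logits = np.array([[2.0, 0.5, 0.1],
                       [0.2, 1.5, 0.3]])
    print(diversity_reward(logits,
                           agent_idx=np.array([0, 2]),
                           sample_prob=np.array([0.5, 0.5])))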