def __init__(self, sess, p, train_phase=True, has_state=False):
    with tf.variable_scope("model", reuse=train_phase) as scope:  # reuse=True for the training phase
        # Placeholders
        X = tf.placeholder(tf.uint8, p.OBS_SHAPE)  # observation
        S = tf.placeholder(tf.float32, p.STATE_SHAPE)
        scaled_x = tf.cast(X, tf.float32) / 255.

        # Additional functions which may be needed
        relu_activ = tf.nn.relu  # ReLU activation
        normalize = lambda layer, phase: tf.layers.batch_normalization(
            layer, center=True, scale=True, training=train_phase)  # batch normalization

        # Model details
        # h1 = relu_activ(conv(scaled_x, scope='conv1', nf=10, rf=5, stride=1, init_scale=np.sqrt(2)))
        # h2 = relu_activ(conv(h1, scope='conv2', nf=10, rf=3, stride=1))
        flattened_x = conv_to_fc(scaled_x)
        h1 = relu_activ(fc(flattened_x, scope='fc1', nh=20, init_scale=np.sqrt(2)))
        h2 = relu_activ(fc(h1, scope='fc2', nh=15, init_scale=np.sqrt(2)))
        hconcat = tf.concat([h2, S], axis=1)
        h3 = relu_activ(fc(hconcat, scope='fc3', nh=10, init_scale=np.sqrt(2)))
        hcommon = relu_activ(fc(h3, scope='fcommon', nh=10, init_scale=np.sqrt(2)))
        pi = fc(hcommon, scope="policy", nh=3, init_scale=0.01)
        vf = fc(hcommon, scope="value", nh=1)

        self.pd_type = CategoricalPdType(p.NUM_ACTIONS)
        self.pd = self.pd_type.pdfromflat(pi)  # action distribution, as in baselines

        # Sample from the distribution
        v0 = vf[:, 0]  # drop the extra dimension
        a0 = self.pd.sample()  # sample an action from the distribution
        neglogp0 = self.pd.neglogp(a0)  # negative log-probability of the sampled action

    self.initial_state = None  # not required for feed-forward models (only for RNN models)

    # Interfaces to the outer world
    def step(ob, state, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob, S: state})
        return a, v, neglogp

    def value(ob, state, *_args, **_kwargs):
        return sess.run(v0, {X: ob, S: state})

    def hidden_value(ob, state, *_args, **_kwargs):
        """Created for debugging purposes."""
        # amodel = np.argmax(np.array(sess.run([pi], {X: ob, S: state})).flatten())
        # a = sess.run([a0], {X: ob, S: state})
        # adict = {"amodel": amodel, "asampler": a}
        return sess.run([hcommon], {X: ob, S: state})

    self.pi = pi
    self.vf = vf
    self.X = X
    self.S = S
    self.step = step
    self.value = value
    self.hidden_value = hidden_value  # required for debugging purposes

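# Usage sketch (illustration only; the helper below is hypothetical and not used by the
# class above): the policy samples an action from the categorical logits `pi` and records
# its negative log-probability, as baselines' CategoricalPd does. This numpy helper
# mirrors that mechanism so the quantities (softmax probabilities, sampled action,
# neglogp) can be inspected outside a TF session.
def _sample_and_neglogp(logits, rng=None):
    import numpy as np
    rng = rng or np.random
    logits = np.asarray(logits, dtype=np.float64)
    z = logits - logits.max(axis=-1, keepdims=True)            # numerically stable softmax
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    actions = np.array([rng.choice(len(p), p=p) for p in probs])  # one sample per row
    neglogp = -np.log(probs[np.arange(len(actions)), actions])    # -log p(sampled action)
    return actions, neglogp

# e.g. _sample_and_neglogp([[0.5, 1.0, -0.2]])  # one observation, 3 actions as in `pi`
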
def make_pdtype(ac_space):
    from cadm import spaces as custom_spaces
    from gym import spaces
    if isinstance(ac_space, custom_spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError

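# Example (a minimal sketch; assumes this module and its pd-type classes are importable
# and that gym is installed): make_pdtype maps an action space to the matching
# probability-distribution type, e.g. Discrete -> CategoricalPdType,
# Box -> DiagGaussianPdType, MultiBinary -> BernoulliPdType.
def _make_pdtype_examples():
    from gym import spaces
    assert isinstance(make_pdtype(spaces.Discrete(5)), CategoricalPdType)
    assert isinstance(make_pdtype(spaces.Box(low=-1.0, high=1.0, shape=(3,))),
                      DiagGaussianPdType)
    assert isinstance(make_pdtype(spaces.MultiBinary(4)), BernoulliPdType)
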
def _build(self):
    num_primitives = self.num_primitives
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size

    self._obs = {}
    for ob_name, ob_shape in self._ob_shape.items():
        self._obs[ob_name] = U.get_placeholder(
            name="ob_{}".format(ob_name), dtype=tf.float32,
            shape=[None] + self._ob_shape[ob_name])
    self._prev_primitive = prev_primitive = U.get_placeholder(
        name="prev_primitive", dtype=tf.int32, shape=[None])

    with tf.variable_scope(self.name):
        self._scope = tf.get_variable_scope().name

        # observation normalization
        self.ob_rms = {}
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])
        obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
               for ob_name in self.ob_type]
        obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
        obz = tf.concat(obz, -1)

        prev_primitive_one_hot = tf.one_hot(prev_primitive, num_primitives,
                                            name="prev_primitive_one_hot")
        obz = tf.concat([obz, prev_primitive_one_hot], -1)

        # value function
        with tf.variable_scope("vf"):
            _ = obz
            for i in range(num_hid_layers):
                _ = self._activation(
                    tf.layers.dense(_, hid_size, name="fc%d" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                _, 1, name="vpred",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # meta policy
        with tf.variable_scope("pol"):
            _ = obz
            for i in range(num_hid_layers):
                _ = self._activation(
                    tf.layers.dense(_, hid_size, name="fc%i" % (i + 1),
                                    kernel_initializer=U.normc_initializer(1.0)))
            self.selector = tf.layers.dense(
                _, num_primitives, name="action",
                kernel_initializer=U.normc_initializer(0.01))

        self.pdtype = pdtype = CategoricalPdType(num_primitives)
        self.pd = pdtype.pdfromflat(self.selector)

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

    self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
    self._act = U.function([stochastic, self._prev_primitive] + self.obs,
                           [ac, self.vpred])

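# Illustration (hypothetical helper, not used by the class above): the meta policy
# normalizes each observation by a running mean/std, clips it to [-5, 5], and appends
# a one-hot encoding of the previously executed primitive before the MLP. This numpy
# sketch reproduces that preprocessing for a single observation vector.
def _preprocess_meta_obs(ob, ob_mean, ob_std, prev_primitive, num_primitives):
    import numpy as np
    obz = np.clip((np.asarray(ob) - ob_mean) / ob_std, -5.0, 5.0)  # normalize and clip
    one_hot = np.eye(num_primitives)[prev_primitive]               # one-hot previous primitive
    return np.concatenate([obz, one_hot], axis=-1)                 # network input

# e.g. _preprocess_meta_obs(np.zeros(4), np.zeros(4), np.ones(4), prev_primitive=2, num_primitives=3)
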
def _init(self, ob_space, ac_space, kind, atom_type_num, args):
    self.pdtype = MultiCatCategoricalPdType

    ### 0 Get input
    ob = {
        'adj': U.get_placeholder(
            name="adj", dtype=tf.float32,
            shape=[None, ob_space['adj'].shape[0], None, None]),
        'node': U.get_placeholder(
            name="node", dtype=tf.float32,
            shape=[None, 1, None, ob_space['node'].shape[2]])
    }
    # only used when evaluating a given action at training time
    self.ac_real = U.get_placeholder(name='ac_real', dtype=tf.int64,
                                     shape=[None, 4])  # feed ground-truth action

    ob_node = tf.compat.v1.layers.dense(ob['node'], 8, activation=None,
                                        use_bias=False, name='emb')  # embedding layer
    if args.bn == 1:
        ob_node = tf.compat.v1.layers.batch_normalization(ob_node, axis=-1)

    if args.has_concat == 1:
        emb_node = tf.concat(
            (GCN_batch(ob['adj'], ob_node, args.emb_size, name='gcn1',
                       aggregate=args.gcn_aggregate), ob_node),
            axis=-1)
    else:
        emb_node = GCN_batch(ob['adj'], ob_node, args.emb_size, name='gcn1',
                             aggregate=args.gcn_aggregate)
    if args.bn == 1:
        emb_node = tf.compat.v1.layers.batch_normalization(emb_node, axis=-1)

    for i in range(args.layer_num_g - 2):
        if args.has_residual == 1:
            emb_node = GCN_batch(ob['adj'], emb_node, args.emb_size,
                                 name='gcn1_' + str(i + 1),
                                 aggregate=args.gcn_aggregate) + self.emb_node1
        elif args.has_concat == 1:
            emb_node = tf.concat(
                (GCN_batch(ob['adj'], emb_node, args.emb_size,
                           name='gcn1_' + str(i + 1),
                           aggregate=args.gcn_aggregate), self.emb_node1),
                axis=-1)
        else:
            emb_node = GCN_batch(ob['adj'], emb_node, args.emb_size,
                                 name='gcn1_' + str(i + 1),
                                 aggregate=args.gcn_aggregate)
        if args.bn == 1:
            emb_node = tf.compat.v1.layers.batch_normalization(emb_node, axis=-1)

    emb_node = GCN_batch(ob['adj'], emb_node, args.emb_size, is_act=False,
                         is_normalize=(args.bn == 0), name='gcn2',
                         aggregate=args.gcn_aggregate)
    emb_node = tf.squeeze(emb_node, axis=1)  # B*n*f

    ### 1 only keep effective nodes
    # ob_mask = tf.cast(tf.transpose(tf.reduce_sum(ob['node'], axis=-1), [0, 2, 1]), dtype=tf.bool)  # B*n*1
    ob_len = tf.reduce_sum(
        tf.squeeze(
            tf.cast(tf.cast(tf.reduce_sum(ob['node'], axis=-1), dtype=tf.bool),
                    dtype=tf.float32),
            axis=-2),
        axis=-1)  # B
    ob_len_first = ob_len - atom_type_num
    logits_mask = tf.sequence_mask(ob_len, maxlen=tf.shape(ob['node'])[2])  # mask all valid entries
    logits_first_mask = tf.sequence_mask(
        ob_len_first,
        maxlen=tf.shape(ob['node'])[2])  # mask valid entries minus atom_type_num (removes isolated nodes)

    if args.mask_null == 1:
        emb_node_null = tf.zeros(tf.shape(emb_node))
        emb_node = tf.where(
            condition=tf.tile(tf.expand_dims(logits_mask, axis=-1),
                              (1, 1, emb_node.get_shape()[-1])),
            x=emb_node, y=emb_node_null)

    ## get graph embedding
    emb_graph = tf.reduce_sum(emb_node, axis=1, keepdims=True)
    if args.graph_emb == 1:
        emb_graph = tf.tile(emb_graph, [1, tf.shape(emb_node)[1], 1])
        emb_node = tf.concat([emb_node, emb_graph], axis=2)

    ### 2 predict stop
    emb_stop = tf.compat.v1.layers.dense(emb_node, args.emb_size,
                                         activation=tf.nn.relu, use_bias=False,
                                         name='linear_stop1')
    if args.bn == 1:
        emb_stop = tf.compat.v1.layers.batch_normalization(emb_stop, axis=-1)
    self.logits_stop = tf.reduce_sum(emb_stop, axis=1)
    self.logits_stop = tf.compat.v1.layers.dense(self.logits_stop, 2,
                                                 activation=None,
                                                 name='linear_stop2_1')  # B*2
    # explicitly show node num
    # self.logits_stop = tf.concat((tf.reduce_mean(tf.compat.v1.layers.dense(emb_node, 32, activation=tf.nn.relu, name='linear_stop1'), axis=1), tf.reshape(ob_len_first / 5, [-1, 1])), axis=1)
    # self.logits_stop = tf.compat.v1.layers.dense(self.logits_stop, 2, activation=None, name='linear_stop2')  # B*2

    stop_shift = tf.constant([[0, args.stop_shift]], dtype=tf.float32)
    pd_stop = CategoricalPdType(-1).pdfromflat(flat=self.logits_stop + stop_shift)
    ac_stop = pd_stop.sample()

    ### 3.1: select first (active) node
    # rules: only select effective nodes
    self.logits_first = tf.compat.v1.layers.dense(emb_node, args.emb_size,
                                                  activation=tf.nn.relu,
                                                  name='linear_select1')
    self.logits_first = tf.squeeze(
        tf.compat.v1.layers.dense(self.logits_first, 1, activation=None,
                                  name='linear_select2'),
        axis=-1)  # B*n
    logits_first_null = tf.ones(tf.shape(self.logits_first)) * -1000
    self.logits_first = tf.where(condition=logits_first_mask,
                                 x=self.logits_first, y=logits_first_null)

    # using own prediction
    pd_first = CategoricalPdType(-1).pdfromflat(flat=self.logits_first)
    ac_first = pd_first.sample()
    mask = tf.one_hot(ac_first, depth=tf.shape(emb_node)[1], dtype=tf.bool,
                      on_value=True, off_value=False)
    emb_first = tf.boolean_mask(emb_node, mask)
    emb_first = tf.expand_dims(emb_first, axis=1)

    # using ground-truth action
    ac_first_real = self.ac_real[:, 0]
    mask_real = tf.one_hot(ac_first_real, depth=tf.shape(emb_node)[1],
                           dtype=tf.bool, on_value=True, off_value=False)
    emb_first_real = tf.boolean_mask(emb_node, mask_real)
    emb_first_real = tf.expand_dims(emb_first_real, axis=1)

    ### 3.2: select second node
    # rules: do not select the first node
    # using own prediction
    # mlp
    emb_cat = tf.concat(
        [tf.tile(emb_first, [1, tf.shape(emb_node)[1], 1]), emb_node], axis=2)
    self.logits_second = tf.compat.v1.layers.dense(emb_cat, args.emb_size,
                                                   activation=tf.nn.relu,
                                                   name='logits_second1')
    self.logits_second = tf.compat.v1.layers.dense(self.logits_second, 1,
                                                   activation=None,
                                                   name='logits_second2')
    # # bilinear
    # self.logits_second = tf.transpose(bilinear(emb_first, emb_node, name='logits_second'), [0, 2, 1])
    self.logits_second = tf.squeeze(self.logits_second, axis=-1)
    ac_first_mask = tf.one_hot(ac_first, depth=tf.shape(emb_node)[1],
                               dtype=tf.bool, on_value=False, off_value=True)
    logits_second_mask = tf.logical_and(logits_mask, ac_first_mask)
    logits_second_null = tf.ones(tf.shape(self.logits_second)) * -1000
    self.logits_second = tf.where(condition=logits_second_mask,
                                  x=self.logits_second, y=logits_second_null)
    pd_second = CategoricalPdType(-1).pdfromflat(flat=self.logits_second)
    ac_second = pd_second.sample()
    mask = tf.one_hot(ac_second, depth=tf.shape(emb_node)[1], dtype=tf.bool,
                      on_value=True, off_value=False)
    emb_second = tf.boolean_mask(emb_node, mask)
    emb_second = tf.expand_dims(emb_second, axis=1)

    # using ground truth
    # mlp
    emb_cat = tf.concat(
        [tf.tile(emb_first_real, [1, tf.shape(emb_node)[1], 1]), emb_node], axis=2)
    self.logits_second_real = tf.compat.v1.layers.dense(
        emb_cat, args.emb_size, activation=tf.nn.relu, name='logits_second1',
        reuse=True)
    self.logits_second_real = tf.compat.v1.layers.dense(
        self.logits_second_real, 1, activation=None, name='logits_second2',
        reuse=True)
    # # bilinear
    # self.logits_second_real = tf.transpose(bilinear(emb_first_real, emb_node, name='logits_second'), [0, 2, 1])
    self.logits_second_real = tf.squeeze(self.logits_second_real, axis=-1)
    ac_first_mask_real = tf.one_hot(ac_first_real, depth=tf.shape(emb_node)[1],
                                    dtype=tf.bool, on_value=False, off_value=True)
    logits_second_mask_real = tf.logical_and(logits_mask, ac_first_mask_real)
    self.logits_second_real = tf.where(condition=logits_second_mask_real,
                                       x=self.logits_second_real,
                                       y=logits_second_null)
    ac_second_real = self.ac_real[:, 1]
    mask_real = tf.one_hot(ac_second_real, depth=tf.shape(emb_node)[1],
                           dtype=tf.bool, on_value=True, off_value=False)
    emb_second_real = tf.boolean_mask(emb_node, mask_real)
    emb_second_real = tf.expand_dims(emb_second_real, axis=1)

    ### 3.3 predict edge type
    # using own prediction
    # MLP
    emb_cat = tf.concat([emb_first, emb_second], axis=-1)
    self.logits_edge = tf.compat.v1.layers.dense(emb_cat, args.emb_size,
                                                 activation=tf.nn.relu,
                                                 name='logits_edge1')
    self.logits_edge = tf.compat.v1.layers.dense(self.logits_edge,
                                                 ob['adj'].get_shape()[1],
                                                 activation=None,
                                                 name='logits_edge2')
    self.logits_edge = tf.squeeze(self.logits_edge, axis=1)
    # # bilinear
    # self.logits_edge = tf.reshape(bilinear_multi(emb_first, emb_second, out_dim=ob['adj'].get_shape()[1]), [-1, ob['adj'].get_shape()[1]])
    pd_edge = CategoricalPdType(-1).pdfromflat(self.logits_edge)
    ac_edge = pd_edge.sample()

    # using ground truth
    # MLP
    emb_cat = tf.concat([emb_first_real, emb_second_real], axis=-1)
    self.logits_edge_real = tf.compat.v1.layers.dense(
        emb_cat, args.emb_size, activation=tf.nn.relu, name='logits_edge1',
        reuse=True)
    self.logits_edge_real = tf.compat.v1.layers.dense(
        self.logits_edge_real, ob['adj'].get_shape()[1], activation=None,
        name='logits_edge2', reuse=True)
    self.logits_edge_real = tf.squeeze(self.logits_edge_real, axis=1)
    # # bilinear
    # self.logits_edge_real = tf.reshape(bilinear_multi(emb_first_real, emb_second_real, out_dim=ob['adj'].get_shape()[1]),
    #                                    [-1, ob['adj'].get_shape()[1]])

    # ncat_list = [tf.shape(logits_first), ob_space['adj'].shape[-1], ob_space['adj'].shape[0]]
    self.pd = self.pdtype(-1).pdfromflat([
        self.logits_first, self.logits_second_real, self.logits_edge_real,
        self.logits_stop
    ])

    self.vpred = tf.compat.v1.layers.dense(emb_node, args.emb_size,
                                           use_bias=False, activation=tf.nn.relu,
                                           name='value1')
    if args.bn == 1:
        self.vpred = tf.compat.v1.layers.batch_normalization(self.vpred, axis=-1)
    self.vpred = tf.reduce_max(self.vpred, axis=1)
    self.vpred = tf.compat.v1.layers.dense(self.vpred, 1, activation=None,
                                           name='value2')

    self.state_in = []
    self.state_out = []

    self.ac = tf.concat(
        (tf.expand_dims(ac_first, axis=1), tf.expand_dims(ac_second, axis=1),
         tf.expand_dims(ac_edge, axis=1), tf.expand_dims(ac_stop, axis=1)),
        axis=1)

    debug = {}
    debug['ob_node'] = tf.shape(ob['node'])
    debug['ob_adj'] = tf.shape(ob['adj'])
    debug['emb_node'] = emb_node
    debug['logits_stop'] = self.logits_stop
    debug['logits_second'] = self.logits_second
    debug['ob_len'] = ob_len
    debug['logits_first_mask'] = logits_first_mask
    debug['logits_second_mask'] = logits_second_mask
    # debug['pd'] = self.pd.logp(self.ac)
    debug['ac'] = self.ac

    stochastic = tf.compat.v1.placeholder(dtype=tf.bool, shape=())
    self._act = U.function([stochastic, ob['adj'], ob['node']],
                           [self.ac, self.vpred, debug])  # add debug to the outputs if needed

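# Illustration (standalone; the helper name is hypothetical): the node-selection heads
# above mask out invalid candidates by overwriting their logits with -1000 before
# sampling, so the softmax assigns them (numerically) zero probability. A numpy
# version of that masking trick:
def _masked_logits(logits, valid_mask, null_value=-1000.0):
    import numpy as np
    logits = np.asarray(logits, dtype=np.float32)
    return np.where(valid_mask, logits, null_value)

# e.g. _masked_logits([1.2, 0.3, -0.5], valid_mask=[True, False, True])
# -> array([   1.2, -1000. ,   -0.5], dtype=float32)
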
def build_act_with_param_noise(make_obs_ph, q_func, hr_func, num_actions,
                               scope="deepq", reuse=None,
                               param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration
    (https://arxiv.org/abs/1706.01905).

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(
            tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(
            tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")
        update_rl_importance_ph = tf.placeholder(tf.float32, (),
                                                 name="update_rl_importance")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable(
            "param_noise_scale", (),
            initializer=tf.constant_initializer(0.01), trainable=False)
        param_noise_threshold = tf.get_variable(
            "param_noise_threshold", (),
            initializer=tf.constant_initializer(0.05), trainable=False)
        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(), num_actions,
                                    scope="perturbed_q_func")

        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond
        # for a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(
                        perturbed_var,
                        var + tf.random_normal(shape=tf.shape(var), mean=0.,
                                               stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another
        # copy of the network and measures the effect of that perturbation in action space.
        # If the perturbation is too big, reduce the scale of the perturbation; otherwise increase it.
        q_values_adaptive = q_func(observations_ph.get(), num_actions,
                                   scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func",
                                            perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(
            tf.nn.softmax(q_values) *
            (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))),
            axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0,
                    lambda: update_param_noise_threshold_ph,
                    lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                           maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                         maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)
        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        predicted_feedback = hr_func(observations_ph.get(), num_actions,
                                     scope="hr_func")
        fb_logit_constant = 10
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        chose_rl = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1,
                                     dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0,
                    lambda: update_rl_importance_ph, lambda: rl_importance))
        updates = [
            update_eps_expr,
            tf.cond(reset_ph,
                    lambda: perturb_vars(original_scope="q_func",
                                         perturbed_scope="perturbed_q_func"),
                    lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(),
                    lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
            update_rl_importance_expr,
        ]

        _act = U.function(
            inputs=[
                observations_ph, stochastic_ph, update_eps_ph, reset_ph,
                update_param_noise_threshold_ph, update_param_noise_scale_ph,
                update_rl_importance_ph
            ],
            outputs=output_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True,
                reset_ph: False,
                update_param_noise_threshold_ph: False,
                update_param_noise_scale_ph: False,
                update_rl_importance_ph: -1.0
            },
            updates=updates)

        def act(ob, reset=False, update_param_noise_threshold=False,
                update_param_noise_scale=False, stochastic=True, update_eps=-1,
                update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, reset,
                        update_param_noise_threshold, update_param_noise_scale,
                        update_rl_importance)

        return act

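# Illustration (standalone; the helper name is hypothetical): the adaptive rule above
# measures the KL divergence between softmax(Q) and softmax(Q_adaptive) and nudges
# param_noise_scale up by 1.01x when the perturbed policy is still too similar
# (mean KL below the threshold) or down by 1.01x otherwise. A numpy version of that update:
def _adapt_param_noise_scale(q_values, q_values_perturbed, scale, threshold):
    import numpy as np

    def softmax(x):
        z = np.asarray(x, dtype=np.float64)
        z = z - z.max(axis=-1, keepdims=True)
        return np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)

    p, q = softmax(q_values), softmax(q_values_perturbed)
    mean_kl = np.mean(np.sum(p * (np.log(p) - np.log(q)), axis=-1))
    return scale * 1.01 if mean_kl < threshold else scale / 1.01

# e.g. _adapt_param_noise_scale([[1., 0., 0.]], [[0.9, 0.1, 0.]], scale=0.01, threshold=0.05)
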
def build_act(make_obs_ph, q_func, hr_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_rl_importance_ph = tf.placeholder(tf.float32, (),
                                                 name="update_rl_importance")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        rl_importance = tf.get_variable("rl_importance", (),
                                        initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                           maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0,
                                         maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)
        rl_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)

        predicted_feedback = hr_func(observations_ph.get(), num_actions,
                                     scope="hr_func")
        fb_logit_constant = 10
        hr_pdtype = CategoricalPdType(num_actions)
        hr_pd = hr_pdtype.pdfromflat(predicted_feedback * fb_logit_constant)
        hr_actions = hr_pd.sample()

        chose_rl = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1,
                                     dtype=tf.float32) < rl_importance
        output_actions = tf.where(chose_rl, rl_actions, hr_actions)

        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        update_rl_importance_expr = rl_importance.assign(
            tf.cond(update_rl_importance_ph >= 0,
                    lambda: update_rl_importance_ph, lambda: rl_importance))

        _act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph,
                    update_rl_importance_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0, update_rl_importance_ph: -1.0,
                    stochastic_ph: True},
            updates=[update_eps_expr, update_rl_importance_expr])

        def act(ob, stochastic=True, update_eps=-1, update_rl_importance=-1):
            return _act(ob, stochastic, update_eps, update_rl_importance)

        return act


def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False):
    output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)]

    sample_prob = tf.reshape(self.sample_agent_prob, tf.stack(output_shape))
    game_score = tf.reshape(
        self.game_score, tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

    rew_agent_label = tf.reshape(
        self.rew_agent_label,
        tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))
    # rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1)
    # rew_agent_label = tf.reshape(rew_agent_label, (-1, self.num_agents))

    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            phi = ph[:, 1:]
            phi = tf.cast(phi, tf.float32)
            phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            phi = phi / 255.

            last_rew_ob = self.last_rew_ob
            last_rew_ob = tf.cast(last_rew_ob, tf.float32)
            last_rew_ob = tf.reshape(
                last_rew_ob, (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:]
            last_rew_ob = last_rew_ob / 255.

            if use_rew:
                phi = tf.concat([phi, last_rew_ob], axis=-1)

            phi = tf.nn.leaky_relu(
                conv(phi, 'c1r', nf=convfeat * 1, rf=8, stride=4,
                     init_scale=np.sqrt(2)))  # [20,20] [8,8]
            phi = tf.nn.leaky_relu(
                conv(phi, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2,
                     init_scale=np.sqrt(2)))  # [9,9] [7,7]
            phi = tf.nn.leaky_relu(
                conv(phi, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1,
                     init_scale=np.sqrt(2)))
            phi = to2d(phi)

            phi = tf.nn.relu(fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2)))
            phi = tf.nn.relu(fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2)))
            disc_logits = fc(phi, 'fc3r', nh=self.num_agents, init_scale=np.sqrt(2))

    one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
    one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents))

    flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1)
    all_div_prob = tf.reshape(
        flatten_all_div_prob,
        (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents))

    sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1)
    sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1))

    div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=disc_logits, labels=one_hot_gidx)
    base_rew = tf.log(0.01)
    div_rew = div_rew - tf.log(sample_prob)
    div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1))

    disc_pdtype = CategoricalPdType(self.num_agents)
    disc_pd = disc_pdtype.pdfromflat(disc_logits)
    disc_nlp = disc_pd.neglogp(rew_agent_label)

    return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp

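# Illustration (standalone; the helper name is hypothetical): the diversity reward above
# is the negative cross-entropy between the discriminator's softmax over agents and the
# one-hot index of the acting agent, minus log(sample_prob). A numpy version for a
# single transition:
def _diversity_reward(disc_logits, agent_idx, sample_prob):
    import numpy as np
    z = np.asarray(disc_logits, dtype=np.float64)
    z = z - z.max()
    probs = np.exp(z) / np.exp(z).sum()
    # -CE(one_hot(agent_idx), probs) reduces to log p(agent_idx)
    return np.log(probs[agent_idx]) - np.log(sample_prob)

# e.g. _diversity_reward([2.0, 0.1, -1.0], agent_idx=0, sample_prob=0.5)
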
def _init(self, ob_space, ac_space):
    with tf.variable_scope(self.scope):
        self.pdtype = pdtype = CategoricalPdType(ac_space)

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space])
        out = ob

        # Apply batch normalization before each activation; four normalization layers were added.
        out = layers.fully_connected(out, num_outputs=256, activation_fn=None,
                                     weights_initializer=U.normc_initializer(1.0))
        axes1 = list(range(len(out.get_shape()) - 1))
        mean1, variance1 = tf.nn.moments(out, axes1)
        out = tf.nn.batch_normalization(out, mean1, variance1, offset=None,
                                        scale=None, variance_epsilon=0.001)
        out = tf.nn.relu(out)

        out = layers.fully_connected(out, num_outputs=128, activation_fn=None,
                                     weights_initializer=U.normc_initializer(1.0))
        axes2 = list(range(len(out.get_shape()) - 1))
        mean2, variance2 = tf.nn.moments(out, axes2)
        out = tf.nn.batch_normalization(out, mean2, variance2, offset=None,
                                        scale=None, variance_epsilon=0.001)
        out = tf.nn.relu(out)

        axes4 = list(range(len(out.get_shape()) - 1))
        mean4, variance4 = tf.nn.moments(out, axes4)
        out = tf.nn.batch_normalization(out, mean4, variance4, offset=None,
                                        scale=None, variance_epsilon=0.001)

        self.batch_size = 1
        self.time_steps = tf.shape(out)[0]
        self.cell_size = 128
        out = tf.reshape(out, [-1, self.time_steps, self.cell_size], name='2_3D')
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.cell_size, forget_bias=1.0,
                                                 state_is_tuple=True)
        state = lstm_cell.zero_state(self.batch_size, tf.float32)
        out, state = tf.nn.dynamic_rnn(lstm_cell, out, initial_state=state,
                                       time_major=False)
        out = tf.reshape(out, [-1, self.cell_size], name='2_2D')
        out = tf.nn.dropout(out, keep_prob=0.6)

        axes3 = list(range(len(out.get_shape()) - 1))
        mean3, variance3 = tf.nn.moments(out, axes3)
        out = tf.nn.batch_normalization(out, mean3, variance3, offset=None,
                                        scale=None, variance_epsilon=0.001)

        out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu,
                                     weights_initializer=U.normc_initializer(1.0))

        pdparam = U.dense(out, pdtype.param_shape()[0], "polfinal")
        self.vpred = U.dense(out, 1, "value")[:, 0]

        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
        update_eps = tf.placeholder(tf.float32, (), name="update_eps")
        deterministic_actions = self.pd.full_sample()  # tf.argmax(q_values, axis=1)
        random_actions = tf.random_uniform(tf.shape(deterministic_actions),
                                           minval=-1, maxval=1, dtype=tf.float32)
        chose_random = tf.random_uniform(tf.shape(deterministic_actions),
                                         minval=0, maxval=1,
                                         dtype=tf.float32) < update_eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)
        ac = U.switch(stochastic, stochastic_actions, self.pd.flatparam())

        self._act = U.function(inputs=[stochastic, update_eps, ob],
                               outputs=[ac, self.vpred, state],
                               givens={update_eps: -1.0, stochastic: True})

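# Illustration (standalone; the helper name is hypothetical): the policy above treats the
# whole batch of per-step features as one sequence of length T for the LSTM: a (T, 128)
# feature matrix is reshaped to (batch=1, time=T, features=128) for tf.nn.dynamic_rnn
# with time_major=False, then flattened back to (T, 128) afterwards.
def _to_lstm_input(features_2d):
    import numpy as np
    t, f = np.asarray(features_2d).shape
    return np.reshape(features_2d, (1, t, f))  # (batch=1, time, features)

# e.g. _to_lstm_input(np.zeros((10, 128))).shape -> (1, 10, 128)
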
def _build(self):
    ac_space = self._ac_space
    num_hid_layers = self._num_hid_layers
    hid_size = self._hid_size
    gaussian_fixed_var = self._gaussian_fixed_var

    # obs
    self._obs = {}
    for ob_name, ob_shape in self._ob_shape.items():
        self._obs[ob_name] = U.get_placeholder(
            name="ob_{}".format(ob_name), dtype=tf.float32,
            shape=[None] + self._ob_shape[ob_name])
    self._cur_primitive = cur_primitive = \
        U.get_placeholder(name="cur_primitive", dtype=tf.int32, shape=[None])

    # obs normalization
    self.ob_rms = {}
    for ob_name in self.ob_type:
        with tf.variable_scope("ob_rms_{}".format(ob_name)):
            self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])
    obz = [(self._obs[ob_name] - self.ob_rms[ob_name].mean) / self.ob_rms[ob_name].std
           for ob_name in self.ob_type]
    obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
    obz = tf.concat(obz, -1)

    cur_primitive_one_hot = tf.one_hot(cur_primitive, self._num_primitives,
                                       name="cur_primitive_one_hot")
    obz = tf.concat([obz, cur_primitive_one_hot], -1)

    # value function
    with tf.variable_scope("vf"):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = self._activation(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(
            last_out, 1, name="final",
            kernel_initializer=U.normc_initializer(1.0))[:, 0]

    # primitive policy
    self.pdtype = pdtype = make_pdtype(ac_space)
    with tf.variable_scope("pol"):
        last_out = obz
        for i in range(num_hid_layers):
            last_out = self._activation(
                tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = tf.layers.dense(
                last_out, pdtype.param_shape()[0] // 2, name="final",
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out, pdtype.param_shape()[0], name="final",
                kernel_initializer=U.normc_initializer(0.01))

        # termination prediction
        if self.term_activation == 'sigmoid':
            self.term_pred = tf.sigmoid(
                tf.layers.dense(last_out, 1, name="term_final",
                                kernel_initializer=U.normc_initializer(1.0))[:, 0])
            stochastic_act = tf.less_equal(
                (1 / (2 * self._config.trans_term_prob)) *
                tf.random_uniform(tf.shape(self.term_pred)),
                self.term_pred)
            deterministic_act = tf.less_equal(
                (1 - self._config.trans_term_prob) * tf.ones_like(self.term_pred),
                self.term_pred)
        else:
            self.term_pred = tf.layers.dense(
                last_out, 2, name="term_final",
                kernel_initializer=U.normc_initializer(0.01))
            self.term_pdtype = term_pdtype = CategoricalPdType(2)
            self.term_pd = term_pdtype.pdfromflat(self.term_pred)
            stochastic_act = self.term_pd.sample()
            deterministic_act = self.term_pd.mode()

    self.pd = pdtype.pdfromflat(pdparam)

    # sample action
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
    term = U.switch(stochastic, stochastic_act, deterministic_act)

    self._act = U.function([stochastic, cur_primitive] + self.obs,
                           [ac, self.vpred, term])
    self._value = U.function([cur_primitive] + self.obs, self.vpred)
    self._term_pred = U.function([stochastic, cur_primitive] + self.obs,
                                 self.term_pred)

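# Illustration (standalone; the helper name is hypothetical): with gaussian_fixed_var the
# policy head above emits pdparam = concat([mean, logstd]), where logstd is a learned,
# state-independent variable; a DiagGaussianPd then samples
# action = mean + exp(logstd) * N(0, 1). A numpy version of that sampling step:
def _sample_diag_gaussian(pdparam, rng=None):
    import numpy as np
    rng = rng or np.random
    pdparam = np.asarray(pdparam, dtype=np.float64)
    mean, logstd = np.split(pdparam, 2, axis=-1)       # first half: mean, second half: logstd
    return mean + np.exp(logstd) * rng.standard_normal(mean.shape)

# e.g. _sample_diag_gaussian([0.0, 0.0, -0.5, -0.5])  # 2-D action space
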