def build_graph(self, weights, loss=None, optimizer=None, norm=False, batch_size=None, grad_ys=None):
    """Build the gradient computation and/or gradient application nodes.

    With ``loss`` given, sets ``self.global_norm`` (global norm of the
    gradients, taken before any clipping) and ``self.calculate`` (the
    gradients passed through ``utils.Utils.reconstruct`` together with
    ``weights.node``). With ``optimizer`` given, sets ``self.ph_gradients``
    and ``self.apply``.

    Args:
        weights: object whose ``node`` holds the variables to differentiate.
        loss: optional object whose ``node`` is differentiated.
        optimizer: optional object whose ``node`` provides ``apply_gradients``.
        norm: threshold handed to ``tf.clip_by_global_norm``; a falsy value
            disables clipping.
        batch_size: when not None, every gradient is divided by it.
        grad_ys: forwarded as the third argument of ``tf.gradients``.
    """
    if loss is not None:
        target = loss.node
        variables = list(utils.Utils.flatten(weights.node))

        grads = []
        for index, grad in enumerate(tf.gradients(target, variables, grad_ys)):
            grads.append(tf.check_numerics(grad, 'gradient_%d' % index))

        if batch_size is not None:
            divisor = float(batch_size)
            grads = [grad / divisor for grad in grads]

        # The reported norm is the pre-clipping one.
        self.global_norm = tf.global_norm(grads)

        # Clipping happens only after the norm above has been recorded.
        if norm:
            grads, _ = tf.clip_by_global_norm(grads, norm)

        self.calculate = graph.TfNode(
            utils.Utils.reconstruct(grads, weights.node))

    if optimizer is not None:
        self.ph_gradients = graph.Placeholders(weights)
        grads_and_vars = utils.Utils.izip(self.ph_gradients.checked,
                                          weights.node)
        self.apply = graph.TfNode(
            optimizer.node.apply_gradients(grads_and_vars))
def build_graph(self, x, batch_size=1, n_units=256):
    """Build a BasicLSTMCell unrolled by ``tf.nn.dynamic_rnn`` over ``x.node``.

    Sets ``self.phs`` (two state placeholders), ``self.ph_state``,
    ``self.zero_state``, ``self.state`` and ``self.weight``.

    Args:
        x: object whose ``node`` is the RNN input (used batch-major).
        batch_size: first dimension of each state placeholder.
        n_units: LSTM size and second dimension of each state placeholder.

    Returns:
        The outputs returned by ``tf.nn.dynamic_rnn``.
    """
    placeholders = []
    for _ in range(2):
        placeholders.append(
            graph.Placeholder(np.float32, [batch_size, n_units]))
    self.phs = placeholders

    self.ph_state = graph.TfNode(tuple(ph.node for ph in self.phs))
    self.ph_state.checked = tuple(ph.checked for ph in self.phs)

    self.zero_state = tuple(
        np.zeros([batch_size, n_units]) for _ in range(2))

    initial_state = tf.contrib.rnn.LSTMStateTuple(*self.ph_state.checked)
    cell = tf.contrib.rnn.BasicLSTMCell(n_units, state_is_tuple=True)

    # One-element slice of the input's dynamic shape (index 1).
    seq_length = tf.shape(x.node)[1:2]
    outputs, final_state = tf.nn.dynamic_rnn(cell,
                                             x.node,
                                             initial_state=initial_state,
                                             sequence_length=seq_length,
                                             time_major=False)
    self.state = graph.TfNode(final_state)

    scope_name = tf.get_variable_scope().name
    self.weight = graph.TfNode(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope_name))
    return outputs
def build_graph(self, *layers):
    """Collect the weights of ``layers`` and build assign/check nodes for them.

    Sets ``self.ph_weights``, ``self.assign`` (one ``tf.assign`` per
    variable/placeholder pair), ``self.check`` (grouped ``check_numerics``
    over all flattened weights) and ``self.global_norm``.

    Args:
        *layers: objects exposing ``weight.node``.

    Returns:
        List with ``weight.node`` of every layer, in argument order.
    """
    weights = [item.weight.node for item in layers]

    self.ph_weights = graph.Placeholders(variables=graph.TfNode(weights))

    assign_ops = []
    for variable, value in utils.Utils.izip(weights, self.ph_weights.checked):
        assign_ops.append(tf.assign(variable, value))
    self.assign = graph.TfNode(assign_ops)

    numeric_checks = []
    for index, weight in enumerate(utils.Utils.flatten(weights)):
        numeric_checks.append(tf.check_numerics(weight, 'weight_%d' % index))
    self.check = graph.TfNode(tf.group(*numeric_checks))

    self.global_norm = tf.global_norm(list(utils.Utils.flatten(weights)))
    return weights
def build_graph(self):
    """Build the worker graph: LSTM over the perception, policy and value heads.

    Sets ``self.lstm``, ``self.ph_goal``, ``self.ph_step_size``,
    ``self.ph_initial_lstm_state``, ``self.lstm_state``, ``self.pi``,
    ``self.vi``, ``self.weights`` and ``self.lstm_state_out``.
    Reads ``self.perception`` and ``self.weights``, which must already exist.
    """
    # NOTE(review): __init__ of the parent is invoked from build_graph —
    # confirm this is intended.
    super(_WorkerNetwork, self).__init__()

    self.lstm = CustomBasicLSTMCell(cfg.d)  # d=256
    # needs wrap as layer to retrieve weights

    self.ph_goal =\
        graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_goal")
    # raw-TF form: tf.placeholder(tf.float32, [None, cfg.d], name="ph_goal")

    perception_expanded = graph.Expand(self.perception.node, 0)

    # NOTE(review): this float32 placeholder is fed to dynamic_rnn as
    # sequence_length — confirm the dtype is accepted.
    self.ph_step_size = \
        graph.Placeholder(np.float32, shape=(1,), name="ph_w_step_size")
    # raw-TF form: tf.placeholder(tf.float32, [1], name="ph_w_step_size")

    self.ph_initial_lstm_state = \
        graph.Placeholder(np.float32, shape=(1, self.lstm.state_size), name="ph_w_lstm_state")
    # raw-TF form: tf.placeholder(tf.float32, [1, self.lstm.state_size], name="ph_w_lstm_state")

    # NOTE(review): graph.* wrapper objects (not their .node) are passed to
    # tf.nn.dynamic_rnn here — verify they convert to tensors.
    lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
        self.lstm,
        perception_expanded,
        initial_state=self.ph_initial_lstm_state,
        sequence_length=self.ph_step_size,
        time_major=False)
    lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
    sg_lstm_outputs = graph.TfNode(lstm_outputs)

    U = layer.LinearLayer(sg_lstm_outputs,
                          shape=(cfg.d, cfg.action_size * cfg.k),
                          transformation=tf.matmul)
    # NOTE(review): U (the layer object) is reshaped directly, whereas w
    # below is accessed through w.node — confirm which is intended.
    U_embedding = tf.transpose(tf.reshape(U, [cfg.action_size, cfg.k, -1]))

    w = layer.LinearLayer(self.ph_goal,
                          shape=(cfg.d, cfg.k),
                          transformation=tf.matmul,
                          bias=False)
    w_reshaped = tf.reshape(w.node, [-1, 1, cfg.k])

    self.pi = layer.MatmulLayer(w_reshaped, U_embedding,
                                activation=layer.Activation.Softmax)
    self.vi = layer.LinearLayer(sg_lstm_outputs,
                                shape=(cfg.d, 1),
                                transformation=tf.matmul)

    # The previous self.weights is folded into the new collection together
    # with the LSTM parameters and the three linear layers.
    self.weights = layer.Weights(
        self.weights,
        graph.TfNode((self.lstm.matrix, self.lstm.bias)),
        U, w, self.vi)

    # Zero-initialised variable of shape [1, state_size], paired with a
    # zero array of the same shape as the second VarAssign argument.
    self.lstm_state_out =\
        graph.VarAssign(graph.Variable(np.zeros([1, self.lstm.state_size]),
                                       dtype=np.float32, name="lstm_state_out"),
                        np.zeros([1, self.lstm.state_size]))
def build_graph(self, x, batch_size, n_units, n_cores):
    """Build a DilatedLSTMCell unrolled by ``tf.nn.dynamic_rnn`` over ``x.node``.

    Sets ``self.ph_state``, ``self.zero_state``, ``self.state``,
    ``self.weight`` and ``self.reset_timestep``.

    Args:
        x: object whose ``node`` is the RNN input (used batch-major).
        batch_size: first dimension of the state placeholder.
        n_units: first argument of ``graph.DilatedLSTMCell``.
        n_cores: second argument of ``graph.DilatedLSTMCell``.

    Returns:
        The outputs returned by ``tf.nn.dynamic_rnn``.
    """
    cell = graph.DilatedLSTMCell(n_units, n_cores)

    self.ph_state = graph.Placeholder(np.float32,
                                      [batch_size, cell.state_size])
    self.zero_state = np.zeros([batch_size, cell.state_size])

    initial_state = self.ph_state.checked
    # One-element slice of the input's dynamic shape (index 1).
    seq_length = tf.shape(x.node)[1:2]
    outputs, final_state = tf.nn.dynamic_rnn(cell,
                                             x.node,
                                             initial_state=initial_state,
                                             sequence_length=seq_length,
                                             time_major=False)

    self.state = graph.TfNode(final_state)
    self.weight = graph.TfNode([cell.matrix, cell.bias])
    self.reset_timestep = graph.TfNode(cell.reset_timestep)
    return outputs
def build_graph(self):
    """Build the value network with an extra timestep input and its ops.

    The loss is the mean squared error against ``ytarg_ny`` plus an L2
    penalty (factor 1e-3) over all flattened weights. Exposes
    ``op_value``, ``op_get_weights_flatten``, ``op_set_weights_flatten``,
    ``op_compute_loss_and_gradient`` and ``op_losses``.
    """
    settings = trpo_config.config
    sg_input = layer.ConfiguredInput(settings.input)

    # One extra feature column carries the timestep.
    ph_step = graph.Placeholder(np.float32, shape=[None, 1])
    state = (sg_input.ph_state, ph_step)
    concatenated = graph.Concat([layer.Flatten(sg_input), ph_step], axis=1)

    activation = layer.Activation.get_activation(settings.activation)
    hidden_descs = [
        dict(type=layer.Dense, size=size, activation=activation)
        for size in settings.hidden_sizes
    ]
    head = layer.GenericLayers(concatenated, hidden_descs)
    value = layer.Dense(head, 1)

    ph_ytarg_ny = graph.Placeholder(np.float32)
    squared_error = tf.square(ph_ytarg_ny.node - value.node)
    mse = graph.TfNode(tf.reduce_mean(squared_error))

    weights = layer.Weights(sg_input, head, value)
    sg_get_weights_flatten = graph.GetVariablesFlatten(weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(weights)

    squared_norms = [
        tf.reduce_sum(tf.square(v))
        for v in utils.Utils.flatten(weights.node)
    ]
    l2 = graph.TfNode(1e-3 * tf.add_n(squared_norms))
    loss = graph.TfNode(l2.node + mse.node)

    sg_gradients = optimizer.Gradients(weights, loss=loss)
    sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

    self.op_value = self.Op(value, state=state)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    self.op_compute_loss_and_gradient = self.Ops(
        loss, sg_gradients_flatten, state=state, ytarg_ny=ph_ytarg_ny)
    self.op_losses = self.Ops(
        loss, mse, l2, state=state, ytarg_ny=ph_ytarg_ny)
def build_graph(self):
    """Build the policy graph: surrogate loss, KL, entropy, FVP and PPO ops.

    Exposes ``op_get_weights``, ``op_get_weights_flatten``,
    ``op_set_weights_flatten``, ``op_compute_gradient``, ``op_losses``,
    ``op_fisher_vector_product`` and ``op_ppo_optimize``.
    """
    sg_network = Network()
    sg_get_weights_flatten = GetVariablesFlatten(sg_network.weights)
    sg_set_weights_flatten = SetVariablesFlatten(sg_network.weights)

    ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

    sg_probtype = ProbType(trpo_config.config.output.action_size)
    ph_oldprob_np = sg_probtype.ProbVariable()

    sg_logp_n = sg_probtype.Loglikelihood(sg_network.actor)
    sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

    # Surrogate loss: negated mean of likelihood ratio times advantage.
    surr_ratio = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
    sg_surr = graph.TfNode(-tf.reduce_mean(surr_ratio * ph_adv_n.node))

    # KL between the actor with gradients stopped and the actor itself,
    # summed and divided by the first dimension of the state input.
    frozen_actor = graph.TfNode(tf.stop_gradient(sg_network.actor.node))
    sg_sum = tf.reduce_sum(
        sg_probtype.Kl(frozen_actor, sg_network.actor).node)
    sg_factor = tf.cast(tf.shape(sg_network.ph_state.node)[0], tf.float32)
    sg_kl_first_fixed = graph.TfNode(sg_sum / sg_factor)

    sg_kl = graph.TfNode(tf.reduce_mean(
        sg_probtype.Kl(ph_oldprob_np, sg_network.actor).node))

    sg_fvp = FisherVectorProduct(sg_kl_first_fixed, sg_network.weights)

    sg_ent = graph.TfNode(tf.reduce_mean(
        sg_probtype.Entropy(sg_network.actor).node))

    sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_surr)
    sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

    self.op_get_weights = self.Op(sg_network.weights)
    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    # Feeds keyed with 'oldprob_np' (gradient and PPO ops).
    oldprob_feeds = dict(state=sg_network.ph_state,
                         sampled_variable=sg_probtype.ph_sampled_variable,
                         adv_n=ph_adv_n,
                         oldprob_np=ph_oldprob_np)
    # Feeds keyed with 'prob_variable' (losses and Fisher-vector product).
    prob_variable_feeds = dict(
        state=sg_network.ph_state,
        sampled_variable=sg_probtype.ph_sampled_variable,
        adv_n=ph_adv_n,
        prob_variable=ph_oldprob_np)

    self.op_compute_gradient = self.Op(sg_gradients_flatten, **oldprob_feeds)

    self.op_losses = self.Ops(sg_surr, sg_kl, sg_ent, **prob_variable_feeds)

    self.op_fisher_vector_product = self.Op(
        sg_fvp, tangent=sg_fvp.ph_tangent, **prob_variable_feeds)

    # PPO clipped surrogate loss
    # likelihood ratio of old and new policy
    r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
    surr = r_theta * ph_adv_n.node
    clip_e = trpo_config.config.PPO.clip_e
    clipped_ratio = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e)
    surr_clipped = clipped_ratio * ph_adv_n.node
    sg_ppo_loss = graph.TfNode(
        -tf.reduce_mean(tf.minimum(surr, surr_clipped)))

    adam = tf.train.AdamOptimizer(
        learning_rate=trpo_config.config.PPO.learning_rate)
    sg_minimize = graph.TfNode(adam.minimize(sg_ppo_loss.node))

    self.op_ppo_optimize = self.Op(sg_minimize, **oldprob_feeds)
def build_graph(self, head, output):
    """Build a tanh Dense output layer and a scaled copy of its output.

    Sets ``self.action_size``, ``self.continuous`` (always True),
    ``self.out``, ``self.weight`` and ``self.scaled_out``.

    Args:
        head: input passed to ``Dense``.
        output: object providing ``action_size`` and ``scale``.
    """
    self.continuous = True
    self.action_size = output.action_size

    dense = Dense(head, self.action_size,
                  activation=Activation.Tanh, init_var=3e-3)
    self.out = dense
    self.weight = dense.weight

    # The tanh output is multiplied by output.scale.
    self.scaled_out = graph.TfNode(dense.node * output.scale)
def build_graph(self, x1, x2, size=1, activation=Activation.Null):
    """Build ``activation(x1 @ W1 + x2 @ W2 + b)`` for two rank-2 inputs.

    Sets ``self.weight`` to a node holding ``(W1, W2, b)`` plus the
    ``weight`` of the activation result.

    Args:
        x1: object whose ``node`` is a rank-2 tensor.
        x2: object whose ``node`` is a rank-2 tensor.
        size: number of output columns of both weight matrices.
        activation: callable applied to the affine combination.

    Returns:
        The ``node`` of the activation result.
    """
    def _uniform_matrix(shape):
        # Uniform init in [-d, d] with d = 1/sqrt(prod(shape[:-1])),
        # falling back to d = 1.0 when that product is zero.
        bound = 1.0
        fan_in = np.prod(shape[:-1])
        if fan_in != 0:
            bound = 1.0 / np.sqrt(fan_in)
        init = graph.RandomUniformInitializer(minval=-bound, maxval=bound)
        return graph.Variable(init(np.float32, shape)).node

    assert len(x1.node.shape) == 2
    shape1 = (x1.node.shape.as_list()[1], size)

    assert len(x2.node.shape) == 2
    shape2 = (x2.node.shape.as_list()[1], size)

    W1 = _uniform_matrix(shape1)
    W2 = _uniform_matrix(shape2)

    # The bias uses the initializer's default range.
    bias_init = graph.RandomUniformInitializer()
    b = graph.Variable(bias_init(np.float32, shape2[-1:])).node

    pre_activation = tf.matmul(x1.node, W1) + tf.matmul(x2.node, W2) + b
    activated = activation(pre_activation)

    self.weight = graph.TfNode((W1, W2, b, activated.weight))
    return activated.node
def build_graph(self):
    """Build a graph that reverses a two-element float state vector.

    Exposes ``self.op_get_action``, fed through ``state``.
    """
    # Build graph
    ph_state = graph.Placeholder(np.float32, shape=(2, ))
    flipped = tf.reverse(ph_state.node, [0])
    sg_reverse = graph.TfNode(flipped)

    # Expose public API
    self.op_get_action = self.Op(sg_reverse, state=ph_state)
def build_graph(self, x):
    """Concatenate ``x.node`` with a trainable, input-independent std.

    A zero-initialised ``logstd`` variable (one entry per column of
    ``x.node``) is exponentiated, tiled along the first axis to the
    dynamic row count of ``x.node`` and appended along axis 1.
    Sets ``self.weight`` to a node wrapping ``logstd``.

    Args:
        x: object whose ``node`` has a statically known second dimension.

    Returns:
        Tensor ``concat([x.node, std], axis=1)``.
    """
    static_shape = x.node.get_shape().as_list()
    input_dim = static_shape[1]

    logstd = tf.Variable(tf.zeros(input_dim, tf.float32))

    std_row = tf.reshape(tf.exp(logstd), [1, -1])
    n_rows = tf.shape(x.node)[0]
    std = tf.tile(std_row, (n_rows, 1))

    self.weight = graph.TfNode(logstd)
    return tf.concat([x.node, std], axis=1)
def build_graph(self):
    """Build the manager graph: Dense embedding, dilated LSTM, goal and value.

    Sets ``self.ph_perception``, ``self.Mspace``, ``self.lstm``,
    ``self.ph_step_size``, ``self.ph_initial_lstm_state``,
    ``self.lstm_state``, ``self.goal``, ``self.value``, ``self.weights``
    and ``self.lstm_state_out``.
    """
    self.ph_perception =\
        graph.Placeholder(np.float32, shape=(None, cfg.d), name="ph_perception")
    # raw-TF form: tf.placeholder(tf.float32, shape=[None, cfg.d], name="ph_perception")

    self.Mspace =\
        layer.Dense(self.ph_perception, cfg.d,  # d=256
                    activation=layer.Activation.Relu)
    # NOTE(review): the layer object itself (not its .node) is handed to
    # graph.Expand — confirm Expand accepts both forms.
    Mspace_expanded = graph.Expand(self.Mspace, 0)

    self.lstm = DilatedLSTMCell(cfg.d, num_cores=cfg.d)
    # needs wrap as layer to retrieve weights

    # NOTE(review): this float32 placeholder is fed to dynamic_rnn as
    # sequence_length — confirm the dtype is accepted.
    self.ph_step_size =\
        graph.Placeholder(np.float32, shape=(1,), name="ph_m_step_size")
    # raw-TF form: tf.placeholder(tf.float32, [1], name="ph_m_step_size")

    self.ph_initial_lstm_state =\
        graph.Placeholder(np.float32, shape=(1, self.lstm.state_size), name="ph_m_lstm_state")
    # raw-TF form: tf.placeholder(tf.float32, [1, self.lstm.state_size], name="ph_m_lstm_state")

    # NOTE(review): graph.* wrapper objects (not their .node) are passed to
    # tf.nn.dynamic_rnn here — verify they convert to tensors.
    lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
        self.lstm,
        Mspace_expanded,
        initial_state=self.ph_initial_lstm_state,
        sequence_length=self.ph_step_size,
        time_major=False)
    lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
    sg_lstm_outputs = graph.TfNode(lstm_outputs)

    # The goal is the L2-normalised (along dim 1) flattened LSTM output.
    # NOTE(review): graph.Flatten(...) is passed to tf.nn.l2_normalize
    # without .node — confirm this is intended.
    self.goal = tf.nn.l2_normalize(graph.Flatten(sg_lstm_outputs), dim=1)

    critic = layer.Dense(sg_lstm_outputs, 1)
    self.value = layer.Flatten(critic)

    self.weights = layer.Weights(
        self.Mspace,
        graph.TfNode((self.lstm.matrix, self.lstm.bias)),
        critic)

    # Zero-initialised variable of shape [1, state_size], paired with a
    # zero array of the same shape as the second VarAssign argument.
    self.lstm_state_out =\
        graph.VarAssign(graph.Variable(np.zeros([1, self.lstm.state_size]),
                                       dtype=np.float32, name="lstm_state_out"),
                        np.zeros([1, self.lstm.state_size]))
def build_graph(self, x, shape, transformation, activation, d=None):
    """Build ``activation(transformation(x, W) + b)`` with uniform init.

    Sets ``self.weight`` to a node holding ``(W, b)``.

    Args:
        x: first argument handed to ``transformation``.
        shape: shape of ``W``; ``b`` has shape ``shape[-1:]``.
        transformation: callable combining ``x`` and ``W``.
        activation: callable applied to the biased result.
        d: half-width of the uniform init range ``[-d, d]``. When None it
            is ``1 / sqrt(prod(shape[:-1]))``, or 1.0 if that product is 0.

    Returns:
        Whatever ``activation`` returns.
    """
    if d is None:
        fan_in = np.prod(shape[:-1])
        d = 1.0 / np.sqrt(fan_in) if fan_in != 0 else 1.0

    # The same initializer instance serves both the matrix and the bias.
    uniform = graph.RandomUniformInitializer(minval=-d, maxval=d)
    W = graph.Variable(uniform(np.float32, shape)).node
    b = graph.Variable(uniform(np.float32, shape[-1:])).node

    self.weight = graph.TfNode((W, b))
    return activation(transformation(x, W) + b)
def build_graph(self):
    """Build the value network over a flat state plus one timestep feature.

    The loss is the mean squared error against ``ytarg_ny`` plus an L2
    penalty (factor 1e-3) over all flattened weights. Exposes
    ``op_value``, ``op_get_weights_flatten``, ``op_set_weights_flatten``,
    ``op_compute_loss_and_gradient`` and ``op_losses``.
    """
    settings = trpo_config.config

    # The configured input shape must have exactly one dimension.
    (input_size,) = settings.input.shape

    # One extra feature column carries the timestep.
    ph_state = graph.Placeholder(np.float32, shape=(None, input_size + 1))

    activation = layer.Activation.get_activation(settings.activation)
    descs = []
    for size in settings.hidden_sizes:
        descs.append(dict(type=layer.Dense, size=size, activation=activation))
    # Final single-unit layer, declared without an activation argument.
    descs.append(dict(type=layer.Dense, size=1))

    value = layer.GenericLayers(ph_state, descs)

    ph_ytarg_ny = graph.Placeholder(np.float32)
    squared_error = tf.square(ph_ytarg_ny.node - value.node)
    mse = graph.TfNode(tf.reduce_mean(squared_error))

    weights = layer.Weights(value)
    sg_get_weights_flatten = GetVariablesFlatten(weights)
    sg_set_weights_flatten = SetVariablesFlatten(weights)

    squared_norms = [
        tf.reduce_sum(tf.square(v))
        for v in utils.Utils.flatten(weights.node)
    ]
    l2 = graph.TfNode(1e-3 * tf.add_n(squared_norms))
    loss = graph.TfNode(l2.node + mse.node)

    sg_gradients = optimizer.Gradients(weights, loss=loss)
    sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

    self.op_value = self.Op(value, state=ph_state)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    self.op_compute_loss_and_gradient = self.Ops(
        loss, sg_gradients_flatten, state=ph_state, ytarg_ny=ph_ytarg_ny)
    self.op_losses = self.Ops(
        loss, mse, l2, state=ph_state, ytarg_ny=ph_ytarg_ny)
def build_graph(self):
    """Build the Q-network, optionally with a dueling value/advantage head.

    Sets ``self.ph_state``, ``self.output`` and ``self.weights``.

    With ``config.dueling_dqn`` set, the output is
    ``V + A - mean(A, axis=1)``. If hidden layers are configured, the last
    hidden output is split along axis 1 into a value part and an advantage
    part; otherwise both heads read the same ``hidden`` node.
    """
    # Local is not called `input`, to avoid shadowing the builtin.
    sg_input = layer.ConfiguredInput(config.input)

    hidden = layer.GenericLayers(layer.Flatten(sg_input), [
        dict(type=layer.Dense, size=size, activation=layer.Activation.Tanh)
        for size in config.hidden_sizes
    ])

    weights = [sg_input, hidden]

    if config.dueling_dqn:
        if config.hidden_sizes:
            last_size = config.hidden_sizes[-1]
            v_size = last_size // 2
            # tf.split needs the chunk sizes to add up to the split
            # dimension; giving the remainder to the advantage stream keeps
            # that true for odd widths (even widths split as before).
            a_size = last_size - v_size
            v_input, a_input = tf.split(hidden.node, [v_size, a_size], axis=1)

            v_input = graph.TfNode(v_input)
            a_input = graph.TfNode(a_input)
        else:
            v_input, a_input = hidden, hidden

        v_output = layer.Dense(v_input, 1)
        a_output = layer.Dense(a_input, config.output.action_size)

        output = v_output.node + a_output.node - tf.reduce_mean(
            a_output.node, axis=1, keep_dims=True)
        output = graph.TfNode(output)

        weights.extend([v_output, a_output])
    else:
        output = layer.Dense(hidden, config.output.action_size)
        weights.append(output)

    self.ph_state = sg_input.ph_state
    self.output = output
    self.weights = layer.Weights(*weights)
def build_graph(self):
    """Build the inverse and forward models used for intrinsic reward.

    Sets ``self.ph_probs``, ``self.ph_taken``, ``self.ph_state``,
    ``self.rew_out`` (``icm.nu * forward loss``), ``self.loss``
    (``icm.beta * forward loss + (1 - icm.beta) * inverse loss``) and
    ``self.weights``.
    """
    conv_layer = dict(type=layer.Convolution,
                      activation=layer.Activation.Elu,
                      n_filters=32,
                      filter_size=[3, 3],
                      stride=[2, 2],
                      border=layer.Border.Same)
    # Four independent descriptor dicts: `[d] * 4` would repeat one shared
    # object, so a mutation by any consumer would affect every layer.
    conv_descs = [dict(conv_layer) for _ in range(4)]

    # Local is not called `input`, to avoid shadowing the builtin.
    sg_input = layer.Input(cfg.config.input, descs=conv_descs)

    shape = [None] + [cfg.config.output.action_size]
    self.ph_probs = graph.Placeholder(np.float32, shape=shape, name='act_probs')
    self.ph_taken = graph.Placeholder(np.int32, shape=(None,), name='act_taken')

    flattened_input = layer.Flatten(sg_input)
    last_size = flattened_input.node.shape.as_list()[-1]

    # Each reshaped row has width 2 * last_size; the two halves are sliced
    # out below.
    inverse_inp = graph.Reshape(sg_input, [-1, last_size*2])

    get_first = graph.TfNode(inverse_inp.node[:, :last_size])
    get_second = graph.TfNode(inverse_inp.node[:, last_size:])

    forward_inp = graph.Concat([get_first, self.ph_probs], axis=1)

    fc_size = cfg.config.hidden_sizes[-1]
    inv_fc1 = layer.Dense(inverse_inp, fc_size, layer.Activation.Relu)
    inv_fc2 = layer.Dense(inv_fc1, shape[-1])  # no softmax applied here

    fwd_fc1 = layer.Dense(forward_inp, fc_size, layer.Activation.Relu)
    fwd_fc2 = layer.Dense(fwd_fc1, last_size)

    inv_loss = graph.SparseSoftmaxCrossEntropyWithLogits(inv_fc2, self.ph_taken).op
    fwd_loss = graph.L2loss(fwd_fc2.node - get_second.node).op

    self.ph_state = sg_input.ph_state  # should be even wrt to batch_size for now
    self.rew_out = graph.TfNode(cfg.config.icm.nu * fwd_loss)

    self.loss = graph.TfNode(cfg.config.icm.beta * fwd_loss +
                             (1 - cfg.config.icm.beta) * inv_loss)

    layers = [sg_input, inv_fc1, inv_fc2, fwd_fc1, fwd_fc2]
    self.weights = layer.Weights(*layers)
def build_graph(self):
    """Build the actor/critic model, its gradients, summaries and public ops.

    Always exposes ``op_assign_weights`` and
    ``op_compute_gradients_and_summaries``. With ``use_icm`` it adds
    ``op_icm_assign_weights``, ``op_get_intrinsic_reward`` and
    ``op_compute_icm_gradients``. With ``use_lstm`` it adds
    ``lstm_zero_state``, ``op_lstm_reset_timestep`` and
    ``op_get_action_value_and_lstm_state``; otherwise it adds
    ``op_get_action_and_value``.
    """
    settings = da3c_config.config

    # Build graph
    sg_network = Network()
    actor = sg_network.actor
    critic = sg_network.critic
    self.actor = actor
    self.critic = critic

    sg_loss = loss.DA3CLoss(actor.head, critic.head, settings)

    sg_actor_gradients = optimizer.Gradients(
        actor.weights,
        loss=graph.TfNode(sg_loss.policy_loss),
        norm=settings.gradients_norm_clipping)
    sg_critic_gradients = optimizer.Gradients(
        critic.weights,
        loss=graph.TfNode(sg_loss.value_loss),
        norm=settings.gradients_norm_clipping)

    if settings.use_icm:
        sg_icm_network = icm_model.ICM()
        sg_icm_gradients = optimizer.Gradients(sg_icm_network.weights,
                                               loss=sg_icm_network.loss)

        # Expose ICM public API
        self.op_icm_assign_weights = self.Op(
            sg_icm_network.weights.assign,
            weights=sg_icm_network.weights.ph_weights)

        icm_feeds = dict(state=sg_icm_network.ph_state,
                         probs=sg_icm_network.ph_probs)
        self.op_get_intrinsic_reward = self.Ops(sg_icm_network.rew_out,
                                                **icm_feeds)

        # The gradient op additionally needs the taken action.
        icm_feeds['action'] = sg_icm_network.ph_taken
        self.op_compute_icm_gradients = self.Op(sg_icm_gradients.calculate,
                                                **icm_feeds)

    scalar_sources = [
        ('policy_loss', sg_loss.policy_loss),
        ('value_loss', sg_loss.value_loss),
        ('entropy', sg_loss.entropy),
        ('actor_gradients_global_norm', sg_actor_gradients.global_norm),
        ('critic_gradients_global_norm', sg_critic_gradients.global_norm),
        ('actor_weights_global_norm', actor.weights.global_norm),
        ('critic_weights_global_norm', critic.weights.global_norm),
    ]
    summaries = tf.summary.merge(
        [tf.summary.scalar(tag, tensor) for tag, tensor in scalar_sources])

    # Expose public API
    self.op_assign_weights = self.Ops(
        actor.weights.assign,
        critic.weights.assign,
        weights=(actor.weights.ph_weights, critic.weights.ph_weights))

    feeds = dict(state=sg_network.ph_state,
                 action=sg_loss.ph_action,
                 advantage=sg_loss.ph_advantage,
                 discounted_reward=sg_loss.ph_discounted_reward)

    if settings.use_lstm:
        feeds['lstm_state'] = (actor.ph_lstm_state, critic.ph_lstm_state)

        self.lstm_zero_state = (actor.lstm_zero_state,
                                critic.lstm_zero_state)
        self.op_lstm_reset_timestep = self.Ops(
            actor.lstm_reset_timestep, critic.lstm_reset_timestep)
        self.op_get_action_value_and_lstm_state = self.Ops(
            actor.head,
            critic.head,
            (actor.lstm_state, critic.lstm_state),
            state=sg_network.ph_state,
            lstm_state=(actor.ph_lstm_state, critic.ph_lstm_state))
    else:
        self.op_get_action_and_value = self.Ops(
            actor.head, critic.head, state=sg_network.ph_state)

    self.op_compute_gradients_and_summaries = self.Ops(
        (sg_actor_gradients.calculate, sg_critic_gradients.calculate),
        summaries,
        **feeds)
def build_graph(self, sg_value_net):
    """Build the value-function loss, its gradients and the public ops.

    The loss is ``critic_scale`` times the mean squared error against
    ``ytarg_ny`` (with the clipped variant when ``vf_clipped_loss`` is set),
    plus an L2 term when ``l2_coeff`` is not None.

    Args:
        sg_value_net: value network exposing ``head``, ``weights``,
            ``ph_state`` and, with LSTM enabled, ``lstm_state``,
            ``ph_lstm_state`` and ``lstm_reset_timestep``.
    """
    settings = dppo_config.config

    # 'Observed' value of a state = discounted reward
    vf_scale = settings.critic_scale
    ph_ytarg_ny = graph.Placeholder(np.float32)
    v1_loss = graph.TfNode(
        tf.square(sg_value_net.head.node - ph_ytarg_ny.node))

    if settings.vf_clipped_loss:
        ph_old_vpred = graph.Placeholder(np.float32)
        clip_e = settings.clip_e
        # The prediction may move at most clip_e away from the old one.
        delta = sg_value_net.head.node - ph_old_vpred.node
        bounded_delta = tf.clip_by_value(delta, -clip_e, clip_e)
        vpredclipped = ph_old_vpred.node + bounded_delta
        v2_loss = graph.TfNode(tf.square(vpredclipped - ph_ytarg_ny.node))
        worst_case = tf.maximum(v2_loss.node, v1_loss.node)
        vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(worst_case))
    else:
        vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(v1_loss.node))

    if settings.l2_coeff is not None:
        squared_norms = [
            tf.reduce_sum(tf.square(v))
            for v in utils.Utils.flatten(sg_value_net.weights.node)
        ]
        l2 = graph.TfNode(settings.l2_coeff * tf.add_n(squared_norms))
        sg_vf_total_loss = graph.TfNode(l2.node + vf_mse.node)
    else:
        sg_vf_total_loss = vf_mse

    sg_gradients = optimizer.Gradients(
        sg_value_net.weights,
        loss=sg_vf_total_loss,
        norm=settings.gradients_norm_clipping)
    sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

    # Op to compute value of a state
    if settings.use_lstm:
        self.op_value = self.Ops(sg_value_net.head,
                                 sg_value_net.lstm_state,
                                 state=sg_value_net.ph_state,
                                 lstm_state=sg_value_net.ph_lstm_state)
        self.op_lstm_reset_timestep = self.Op(
            sg_value_net.lstm_reset_timestep)
    else:
        self.op_value = self.Op(sg_value_net.head,
                                state=sg_value_net.ph_state)

    self.op_get_weights = self.Op(sg_value_net.weights)
    self.op_assign_weights = self.Op(
        sg_value_net.weights.assign,
        weights=sg_value_net.weights.ph_weights)

    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_value_net.weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_value_net.weights)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    feeds = dict(state=sg_value_net.ph_state, ytarg_ny=ph_ytarg_ny)
    if settings.use_lstm:
        feeds['lstm_state'] = sg_value_net.ph_lstm_state
    if settings.vf_clipped_loss:
        feeds['vpred_old'] = ph_old_vpred

    self.op_compute_gradients = self.Op(sg_gradients.calculate, **feeds)
    if settings.use_lstm:
        # With LSTM the op is replaced by one that also returns the state.
        self.op_compute_gradients = self.Ops(sg_gradients.calculate,
                                             sg_value_net.lstm_state,
                                             **feeds)

    self.op_compute_loss_and_gradient_flatten = self.Ops(
        sg_vf_total_loss, sg_gradients_flatten, **feeds)

    losses = [sg_vf_total_loss, vf_mse]
    if settings.l2_coeff is not None:
        losses.append(l2)
    self.op_losses = self.Ops(*losses, **feeds)

    # Init Op for all weights
    sg_initialize = graph.Initialize()
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self, sg_network):
    """Build the PPO clipped policy loss, its gradients and the public ops.

    The loss is the negated mean of ``min(r * adv, clip(r) * adv)`` where
    ``r`` is the likelihood ratio between the new and old policy, plus an
    entropy term when ``entropy`` is not None.

    Args:
        sg_network: policy network exposing ``head``, ``weights``,
            ``ph_state`` and, with LSTM enabled, ``lstm_state``,
            ``ph_lstm_state`` and ``lstm_reset_timestep``.
    """
    settings = dppo_config.config

    if settings.use_lstm:
        self.op_get_action = self.Ops(sg_network.head,
                                      sg_network.lstm_state,
                                      state=sg_network.ph_state,
                                      lstm_state=sg_network.ph_lstm_state)
        self.op_lstm_reset_timestep = self.Op(sg_network.lstm_reset_timestep)
    else:
        self.op_get_action = self.Op(sg_network.head,
                                     state=sg_network.ph_state)

    # Advantage node
    ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

    # Contains placeholder for the actual action made by the agent
    sg_probtype = ProbType(settings.output.action_size,
                           continuous=settings.output.continuous)

    # Placeholder to store action probabilities under the old policy
    ph_oldprob_np = sg_probtype.ProbVariable()

    sg_logp_n = sg_probtype.Loglikelihood(sg_network.head)
    sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

    # PPO clipped surrogate loss
    # likelihood ratio of old and new policy
    r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
    surr = r_theta * ph_adv_n.node
    clip_e = settings.clip_e
    bounded_ratio = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e)
    surr_clipped = bounded_ratio * ph_adv_n.node
    sg_pol_clip_loss = graph.TfNode(
        -tf.reduce_mean(tf.minimum(surr, surr_clipped)))

    # PPO entropy loss
    if settings.entropy is not None:
        sg_entropy = sg_probtype.Entropy(sg_network.head)
        sg_ent_loss = (-settings.entropy) * tf.reduce_mean(sg_entropy.node)
        sg_pol_total_loss = graph.TfNode(sg_pol_clip_loss.node + sg_ent_loss)
    else:
        sg_pol_total_loss = sg_pol_clip_loss

    # Regular gradients
    sg_ppo_clip_gradients = optimizer.Gradients(
        sg_network.weights,
        loss=sg_pol_total_loss,
        norm=settings.gradients_norm_clipping)

    feeds = dict(state=sg_network.ph_state,
                 action=sg_probtype.ph_sampled_variable,
                 advantage=ph_adv_n,
                 old_prob=ph_oldprob_np)
    if settings.use_lstm:
        feeds['lstm_state'] = sg_network.ph_lstm_state

    self.op_compute_ppo_clip_gradients = self.Op(
        sg_ppo_clip_gradients.calculate, **feeds)
    if settings.use_lstm:
        # With LSTM the op is replaced by one that also returns the state.
        self.op_compute_ppo_clip_gradients = self.Ops(
            sg_ppo_clip_gradients.calculate,
            sg_network.lstm_state,
            **feeds)

    # Weights get/set for updating the policy
    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_network.weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_network.weights)

    self.op_get_weights = self.Op(sg_network.weights)
    self.op_assign_weights = self.Op(sg_network.weights.assign,
                                     weights=sg_network.weights.ph_weights)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    # Init Op for all weights
    sg_initialize = graph.Initialize()
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self, d):
    """Store ``d`` and create the float32 sampled-variable placeholder.

    Args:
        d: value kept in ``self._d``.
    """
    self._d = d
    sampled = tf.placeholder(tf.float32, name='a')
    self.ph_sampled_variable = graph.TfNode(sampled)
def build_graph(self, n):
    """Store ``n`` and create the int32 sampled-variable placeholder.

    Args:
        n: value kept in ``self._n``.
    """
    self._n = n
    sampled = tf.placeholder(tf.int32, name='a')
    self.ph_sampled_variable = graph.TfNode(sampled)