def build_graph(self, actor, critic, cfg):
    self.ph_action = graph.Placeholder(np.float32,
                                       shape=(None, actor.action_size),
                                       name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)
    log_std_dev = tf.log(sigma2)

    self.entropy = tf.reduce_mean(
        log_std_dev + tf.constant(0.5 * np.log(2. * np.pi * np.e), tf.float32))

    l2_dist = tf.square(self.ph_action.node - mu)
    sqr_std_dev = tf.constant(2.) * tf.square(sigma2) + tf.constant(1e-6)
    log_prob = -l2_dist / sqr_std_dev \
        - tf.constant(.5) * tf.log(tf.constant(2 * np.pi)) - log_std_dev

    self.policy_loss = -(tf.reduce_mean(
        tf.reduce_sum(log_prob, axis=1) * self.ph_advantage.node) +
        cfg.entropy_beta * self.entropy)

    # learning rate for the critic is scaled by the critic_scale parameter
    self.value_loss = cfg.critic_scale * tf.reduce_mean(
        tf.square(self.ph_discounted_reward.node - critic.node))

def build_graph(self, actor, critic, entropy=True):
    self.ph_action = graph.Placeholder(np.int32, shape=(None,), name="a")
    self.ph_value = graph.Placeholder(np.float32, shape=(None,), name="v")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="r")

    action_one_hot = tf.one_hot(self.ph_action.node, cfg.action_size)

    # avoid NaN by flooring the policy output at a small positive value
    log_pi = tf.log(tf.maximum(actor.node, 1e-20))

    # policy entropy
    if entropy:
        entropy = -tf.reduce_sum(actor.node * log_pi, axis=1)
        # policy loss (output); the minus sign is there because the original
        # paper's objective is written for gradient ascent, while we use a
        # gradient descent optimizer
        policy_loss = -tf.reduce_sum(
            tf.reduce_sum(log_pi * action_one_hot, axis=1) *
            (self.ph_discounted_reward.node - self.ph_value.node) +
            entropy * cfg.entropy_beta)
    else:
        policy_loss = -tf.reduce_sum(
            tf.reduce_sum(log_pi * action_one_hot, axis=1) *
            (self.ph_discounted_reward.node - self.ph_value.node))

    # value loss (output)
    # (the learning rate for the critic is half of the actor's; this is an
    # L2 loss without the 0.5 factor)
    value_loss = tf.reduce_sum(
        tf.square(self.ph_discounted_reward.node - critic.node))

    # gradients of policy and value are summed up
    return policy_loss + value_loss

def build_graph(self, actor, critic, cfg):
    self.ph_action = graph.Placeholder(np.int32, shape=(None,), name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    action_one_hot = tf.one_hot(self.ph_action.node, actor.action_size)

    # avoid NaN
    log_pi = tf.log(tf.maximum(actor.node, 1e-20))

    # policy entropy
    self.entropy = -tf.reduce_sum(actor.node * log_pi)

    # policy loss
    self.policy_loss = -(tf.reduce_sum(
        tf.reduce_sum(log_pi * action_one_hot, axis=1) * self.ph_advantage.node) +
        self.entropy * cfg.entropy_beta)

    # value loss
    self.value_loss = tf.reduce_sum(
        tf.square(self.ph_discounted_reward.node - critic.node))

    # gradients of policy and value are summed up
    # (learning rate for the critic is scaled by the critic_scale parameter)
    return self.policy_loss + cfg.critic_scale * self.value_loss

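# A quick NumPy sanity check (a sketch, not framework code) of the
# log-probability selection used above: multiplying log_pi by a one-hot
# action mask and summing over the action axis picks out log pi(a_t | s_t).
import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.3, 0.6]])         # actor output for two steps
actions = np.array([0, 2])                  # actions actually taken
one_hot = np.eye(3)[actions]                # same role as tf.one_hot
log_pi = np.log(np.maximum(probs, 1e-20))   # same NaN guard as above
selected = (log_pi * one_hot).sum(axis=1)   # log pi(a_t | s_t)
assert np.allclose(selected, np.log([0.7, 0.6]))
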
def build_graph(self):
    super(_WorkerNetwork, self).__init__()
    self.lstm = CustomBasicLSTMCell(cfg.d)  # d=256
    # needs to be wrapped as a layer to retrieve the weights

    self.ph_goal = graph.Placeholder(np.float32, shape=(None, cfg.d),
                                     name="ph_goal")

    perception_expanded = graph.Expand(self.perception.node, 0)

    self.ph_step_size = graph.Placeholder(np.float32, shape=(1,),
                                          name="ph_w_step_size")
    self.ph_initial_lstm_state = graph.Placeholder(np.float32,
                                                   shape=(1, self.lstm.state_size),
                                                   name="ph_w_lstm_state")

    lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
        self.lstm, perception_expanded,
        initial_state=self.ph_initial_lstm_state,
        sequence_length=self.ph_step_size,
        time_major=False)
    lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
    sg_lstm_outputs = graph.TfNode(lstm_outputs)

    U = layer.LinearLayer(sg_lstm_outputs,
                          shape=(cfg.d, cfg.action_size * cfg.k),
                          transformation=tf.matmul)
    # tf.reshape needs the tensor, as with w.node below
    U_embedding = tf.transpose(tf.reshape(U.node, [cfg.action_size, cfg.k, -1]))

    w = layer.LinearLayer(self.ph_goal, shape=(cfg.d, cfg.k),
                          transformation=tf.matmul, bias=False)
    w_reshaped = tf.reshape(w.node, [-1, 1, cfg.k])

    self.pi = layer.MatmulLayer(w_reshaped, U_embedding,
                                activation=layer.Activation.Softmax)
    self.vi = layer.LinearLayer(sg_lstm_outputs, shape=(cfg.d, 1),
                                transformation=tf.matmul)

    self.weights = layer.Weights(self.weights,
                                 graph.TfNode((self.lstm.matrix, self.lstm.bias)),
                                 U, w, self.vi)

    self.lstm_state_out = graph.VarAssign(
        graph.Variable(np.zeros([1, self.lstm.state_size]),
                       dtype=np.float32, name="lstm_state_out"),
        np.zeros([1, self.lstm.state_size]))

def build_graph(self, action_size, network):
    self.ph_action = graph.Placeholder(np.int32, (None,))
    self.ph_discounted_reward = graph.Placeholder(np.float32, (None, 1))

    # make actions that gave a good advantage (reward over time) more likely,
    # and actions that didn't less likely
    log_like_op = tf.log(
        tf.reduce_sum(tf.one_hot(self.ph_action.node, action_size) * network.node,
                      axis=[1]))
    return -tf.reduce_sum(log_like_op * self.ph_discounted_reward.node)

def build_graph(self):
    input = layer.ConfiguredInput(trpo_config.config.input)

    # add one extra feature for timestep
    ph_step = graph.Placeholder(np.float32, shape=[None, 1])
    state = (input.ph_state, ph_step)
    concatenated = graph.Concat([layer.Flatten(input), ph_step], axis=1)

    activation = layer.Activation.get_activation(trpo_config.config.activation)
    head = layer.GenericLayers(concatenated,
                               [dict(type=layer.Dense, size=size, activation=activation)
                                for size in trpo_config.config.hidden_sizes])
    value = layer.Dense(head, 1)

    ph_ytarg_ny = graph.Placeholder(np.float32)
    mse = graph.TfNode(tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

    weights = layer.Weights(input, head, value)
    sg_get_weights_flatten = graph.GetVariablesFlatten(weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(weights)

    l2 = graph.TfNode(1e-3 * tf.add_n([tf.reduce_sum(tf.square(v))
                                       for v in utils.Utils.flatten(weights.node)]))
    loss = graph.TfNode(l2.node + mse.node)

    sg_gradients = optimizer.Gradients(weights, loss=loss)
    sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

    self.op_value = self.Op(value, state=state)
    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(sg_set_weights_flatten,
                                          value=sg_set_weights_flatten.ph_value)
    self.op_compute_loss_and_gradient = self.Ops(loss, sg_gradients_flatten,
                                                 state=state, ytarg_ny=ph_ytarg_ny)
    self.op_losses = self.Ops(loss, mse, l2, state=state, ytarg_ny=ph_ytarg_ny)

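# Sketch of the flatten/unflatten round trip that GetVariablesFlatten and
# SetVariablesFlatten provide over the graph variables (plain NumPy here;
# the actual graph classes live elsewhere in the framework).
import numpy as np

ws = [np.ones((2, 3)), np.zeros(5)]                 # stand-in weight tensors
flat = np.concatenate([w.reshape(-1) for w in ws])  # "get ... flatten"
splits = np.split(flat, np.cumsum([w.size for w in ws])[:-1])
restored = [s.reshape(w.shape) for s, w in zip(splits, ws)]  # "set ... flatten"
assert all(np.array_equal(a, b) for a, b in zip(ws, restored))
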
def build_graph(self, x, batch_size=1, n_units=256):
    self.phs = [graph.Placeholder(np.float32, [batch_size, n_units])
                for _ in range(2)]
    self.ph_state = graph.TfNode(tuple(ph.node for ph in self.phs))
    self.ph_state.checked = tuple(ph.checked for ph in self.phs)
    self.zero_state = tuple(np.zeros([batch_size, n_units]) for _ in range(2))

    state = tf.contrib.rnn.LSTMStateTuple(*self.ph_state.checked)
    lstm = tf.contrib.rnn.BasicLSTMCell(n_units, state_is_tuple=True)
    outputs, self.state = tf.nn.dynamic_rnn(lstm, x.node,
                                            initial_state=state,
                                            sequence_length=tf.shape(x.node)[1:2],
                                            time_major=False)
    self.state = graph.TfNode(self.state)
    self.weight = graph.TfNode(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                          tf.get_variable_scope().name))
    return outputs

def build_graph(self):
    input = layer.Input(cfg.config.input)
    self.ph_action = graph.Placeholder(np.float32,
                                       (None, cfg.config.output.action_size))

    sizes = cfg.config.hidden_sizes
    assert len(sizes) > 1, 'You need to provide sizes for at least 2 layers'

    dense_1st = layer.Dense(layer.Flatten(input), sizes[0],
                            layer.Activation.Relu)
    dense_2nd = layer.DoubleDense(dense_1st, self.ph_action, sizes[1],
                                  layer.Activation.Relu)
    layers = [input, dense_1st, dense_2nd]

    net = layer.GenericLayers(dense_2nd,
                              [dict(type=layer.Dense, size=size,
                                    activation=layer.Activation.Relu)
                               for size in sizes[2:]])
    if len(sizes[2:]) > 0:
        layers.append(net)

    self.critic = layer.Dense(net, 1, init_var=3e-3)
    self.ph_state = input.ph_state
    layers.append(self.critic)
    self.weights = layer.Weights(*layers)

def build_graph(self):
    # Build graph
    state = graph.Placeholder(np.float32, shape=(2,))
    reverse = graph.TfNode(tf.reverse(state.node, [0]))

    # Expose public API
    self.op_get_action = self.Op(reverse, state=state)

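# Hedged usage sketch: the Op-call convention mirrors the rest of this file,
# but the `session` object below is an assumption. The model simply reverses
# its 2-element state vector, so
#     session.op_get_action(state=[0.25, 0.75])
# would return [0.75, 0.25].
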
def build_graph(self, goal, critic):
    self.ph_stc_diff_st = graph.Placeholder(np.float32, shape=(None, cfg.d),
                                            name="ph_stc_diff_st")
    s_diff_normalized = tf.nn.l2_normalize(self.ph_stc_diff_st.node, dim=1)

    cosine_similarity = tf.matmul(s_diff_normalized, goal.node, transpose_b=True)
    cosine_similarity = tf.diag_part(cosine_similarity)

    # manager's advantage (R-V): R = ri + cfg.wGAMMA * R; AdvM = R - ViM
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,),
                                                  name="ph_m_discounted_reward")
    advantage = self.ph_discounted_reward.node - critic.node

    manager_loss = tf.reduce_sum(advantage * cosine_similarity)
    return manager_loss

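# Minimal NumPy sketch of the diag-of-matmul trick above: the full matmul
# gives every pairwise dot product, and tf.diag_part keeps only the matching
# (row i, goal i) pairs, i.e. a batched cosine similarity once both sides
# are l2-normalized (the goal is normalized in the manager network below).
import numpy as np

s_diff = np.array([[1., 0.], [0., 1.]])   # already l2-normalized rows
goal = np.array([[1., 0.], [1., 0.]])     # unit goal vectors
cos_sim = np.diag(s_diff @ goal.T)        # batched dot products
assert np.allclose(cos_sim, [1., 0.])
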
def build_graph(self):
    self.ph_perception = graph.Placeholder(np.float32, shape=(None, cfg.d),
                                           name="ph_perception")

    self.Mspace = layer.Dense(self.ph_perception, cfg.d,  # d=256
                              activation=layer.Activation.Relu)
    Mspace_expanded = graph.Expand(self.Mspace, 0)

    self.lstm = DilatedLSTMCell(cfg.d, num_cores=cfg.d)
    # needs to be wrapped as a layer to retrieve the weights

    self.ph_step_size = graph.Placeholder(np.float32, shape=(1,),
                                          name="ph_m_step_size")
    self.ph_initial_lstm_state = graph.Placeholder(np.float32,
                                                   shape=(1, self.lstm.state_size),
                                                   name="ph_m_lstm_state")

    lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(
        self.lstm, Mspace_expanded,
        initial_state=self.ph_initial_lstm_state,
        sequence_length=self.ph_step_size,
        time_major=False)
    lstm_outputs = tf.reshape(lstm_outputs, [-1, cfg.d])
    sg_lstm_outputs = graph.TfNode(lstm_outputs)

    self.goal = tf.nn.l2_normalize(graph.Flatten(sg_lstm_outputs), dim=1)

    critic = layer.Dense(sg_lstm_outputs, 1)
    self.value = layer.Flatten(critic)

    self.weights = layer.Weights(self.Mspace,
                                 graph.TfNode((self.lstm.matrix, self.lstm.bias)),
                                 critic)

    self.lstm_state_out = graph.VarAssign(
        graph.Variable(np.zeros([1, self.lstm.state_size]),
                       dtype=np.float32, name="lstm_state_out"),
        np.zeros([1, self.lstm.state_size]))

def build_graph(self, actor, critic, cfg):
    self.ph_action = graph.Placeholder(np.float32,
                                       shape=(None, actor.action_size),
                                       name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)

    # note: tf.contrib.distributions.Normal takes loc and scale (a standard
    # deviation), so sigma2 is used as the scale here
    normal_dist = tf.contrib.distributions.Normal(mu, sigma2)
    log_prob = normal_dist.log_prob(self.ph_action.node)

    if cfg.entropy_type == 'Gauss':
        self.entropy = tf.reduce_mean(normal_dist.entropy())
    elif cfg.entropy_type == 'Origin':
        self.entropy = tf.reduce_mean(-0.5 * (tf.log(2 * np.pi * sigma2) + 1.0))
    else:
        assert False, "You should provide one of 2 entropy types: 'Gauss' or 'Origin'"

    self.policy_loss = -(tf.reduce_mean(
        tf.reduce_sum(log_prob, axis=1) * self.ph_advantage.node) +
        cfg.entropy_beta * self.entropy)
    if cfg.policy_clip:
        self.policy_loss = tf.clip_by_value(self.policy_loss,
                                            -tf.abs(cfg.policy_clip),
                                            tf.abs(cfg.policy_clip))

    # learning rate for the critic is scaled by the critic_scale parameter
    self.value_loss = cfg.critic_scale * tf.reduce_mean(
        tf.square(self.ph_discounted_reward.node - critic.node))
    if cfg.critic_clip:
        self.value_loss = tf.clip_by_value(self.value_loss,
                                           -tf.abs(cfg.critic_clip),
                                           tf.abs(cfg.critic_clip))

def build_graph(self, x, batch_size, n_units, n_cores):
    lstm = graph.DilatedLSTMCell(n_units, n_cores)
    self.ph_state = graph.Placeholder(np.float32, [batch_size, lstm.state_size])
    self.zero_state = np.zeros([batch_size, lstm.state_size])

    outputs, self.state = tf.nn.dynamic_rnn(lstm, x.node,
                                            initial_state=self.ph_state.checked,
                                            sequence_length=tf.shape(x.node)[1:2],
                                            time_major=False)
    self.state = graph.TfNode(self.state)
    self.weight = graph.TfNode([lstm.matrix, lstm.bias])
    self.reset_timestep = graph.TfNode(lstm.reset_timestep)
    return outputs

def build_graph(self):
    conv_layer = dict(type=layer.Convolution,
                      activation=layer.Activation.Elu,
                      n_filters=32,
                      filter_size=[3, 3],
                      stride=[2, 2],
                      border=layer.Border.Same)
    input = layer.Input(cfg.config.input, descs=[dict(conv_layer)] * 4)

    shape = [None] + [cfg.config.output.action_size]
    self.ph_probs = graph.Placeholder(np.float32, shape=shape, name='act_probs')
    self.ph_taken = graph.Placeholder(np.int32, shape=(None,), name='act_taken')

    flattened_input = layer.Flatten(input)
    last_size = flattened_input.node.shape.as_list()[-1]

    inverse_inp = graph.Reshape(input, [-1, last_size * 2])

    get_first = graph.TfNode(inverse_inp.node[:, :last_size])
    get_second = graph.TfNode(inverse_inp.node[:, last_size:])

    forward_inp = graph.Concat([get_first, self.ph_probs], axis=1)

    fc_size = cfg.config.hidden_sizes[-1]

    inv_fc1 = layer.Dense(inverse_inp, fc_size, layer.Activation.Relu)
    inv_fc2 = layer.Dense(inv_fc1, shape[-1])  # layer.Activation.Softmax
    fwd_fc1 = layer.Dense(forward_inp, fc_size, layer.Activation.Relu)
    fwd_fc2 = layer.Dense(fwd_fc1, last_size)

    inv_loss = graph.SparseSoftmaxCrossEntropyWithLogits(inv_fc2, self.ph_taken).op
    fwd_loss = graph.L2loss(fwd_fc2.node - get_second.node).op

    self.ph_state = input.ph_state  # the batch size should be even for now

    self.rew_out = graph.TfNode(cfg.config.icm.nu * fwd_loss)
    self.loss = graph.TfNode(cfg.config.icm.beta * fwd_loss +
                             (1 - cfg.config.icm.beta) * inv_loss)

    layers = [input, inv_fc1, inv_fc2, fwd_fc1, fwd_fc2]
    self.weights = layer.Weights(*layers)

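# Sketch of the batch layout the ICM graph appears to assume (hence the
# "batch size should be even" note): states arrive as stacked (s_t, s_t+1)
# pairs, so reshaping to [-1, last_size * 2] puts s_t features in the first
# half of each row and s_t+1 features in the second half.
import numpy as np

F = 3                                       # stand-in feature size
states = np.arange(4 * F).reshape(4, F)     # s0, s1, s2, s3
pairs = states.reshape(-1, 2 * F)           # rows: (s0|s1), (s2|s3)
s_t, s_next = pairs[:, :F], pairs[:, F:]
assert np.array_equal(s_t, states[0::2])
assert np.array_equal(s_next, states[1::2])
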
def build_graph(self):
    input_size, = trpo_config.config.input.shape

    # add one extra feature for timestep
    ph_state = graph.Placeholder(np.float32, shape=(None, input_size + 1))

    activation = layer.Activation.get_activation(trpo_config.config.activation)
    descs = [dict(type=layer.Dense, size=size, activation=activation)
             for size in trpo_config.config.hidden_sizes]
    descs.append(dict(type=layer.Dense, size=1))
    value = layer.GenericLayers(ph_state, descs)

    ph_ytarg_ny = graph.Placeholder(np.float32)
    mse = graph.TfNode(tf.reduce_mean(tf.square(ph_ytarg_ny.node - value.node)))

    weights = layer.Weights(value)
    sg_get_weights_flatten = GetVariablesFlatten(weights)
    sg_set_weights_flatten = SetVariablesFlatten(weights)

    l2 = graph.TfNode(1e-3 * tf.add_n([tf.reduce_sum(tf.square(v))
                                       for v in utils.Utils.flatten(weights.node)]))
    loss = graph.TfNode(l2.node + mse.node)

    sg_gradients = optimizer.Gradients(weights, loss=loss)
    sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

    self.op_value = self.Op(value, state=ph_state)
    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(sg_set_weights_flatten,
                                          value=sg_set_weights_flatten.ph_value)
    self.op_compute_loss_and_gradient = self.Ops(loss, sg_gradients_flatten,
                                                 state=ph_state, ytarg_ny=ph_ytarg_ny)
    self.op_losses = self.Ops(loss, mse, l2, state=ph_state, ytarg_ny=ph_ytarg_ny)

def build_graph(self, actor, critic, cfg):
    self.ph_action = graph.Placeholder(np.float32,
                                       shape=(None, actor.action_size),
                                       name="ph_action")
    self.ph_advantage = graph.Placeholder(np.float32, shape=(None,), name="ph_adv")
    self.ph_discounted_reward = graph.Placeholder(np.float32, shape=(None,), name="ph_edr")

    mu, sigma2 = actor.node
    sigma2 += tf.constant(1e-8)

    # policy entropy
    self.entropy = -tf.reduce_mean(0.5 * (tf.log(2. * np.pi * sigma2) + 1.))

    # policy loss (calculation)
    b_size = tf.to_float(tf.size(self.ph_action.node) / actor.action_size)
    log_pi = tf.log(sigma2)
    x_prec = tf.exp(-log_pi)
    x_diff = tf.subtract(self.ph_action.node, mu)
    x_power = tf.square(x_diff) * x_prec * -0.5
    gaussian_nll = (tf.reduce_sum(log_pi, axis=1) +
                    b_size * tf.log(2. * np.pi)) / 2. \
        - tf.reduce_sum(x_power, axis=1)
    self.policy_loss = -(tf.reduce_mean(gaussian_nll * self.ph_advantage.node) +
                         cfg.entropy_beta * self.entropy)

    # value loss
    # (learning rate for the critic is scaled by the critic_scale parameter)
    self.value_loss = cfg.critic_scale * tf.reduce_mean(
        tf.square(self.ph_discounted_reward.node - critic.node))

def build_graph(self, kl_first_fixed, weights):
    weight_list = list(utils.Utils.flatten(weights.node))
    gradients1 = tf.gradients(kl_first_fixed.node, weight_list)
    ph_tangent = graph.Placeholder(np.float32, shape=(None,))

    gvp = []
    start = 0
    for g in gradients1:
        size = np.prod(g.shape.as_list())
        gvp.append(tf.reduce_sum(tf.reshape(g, [-1]) *
                                 ph_tangent.node[start:start + size]))
        start += size

    gradients2 = tf.gradients(gvp, weight_list)
    fvp = tf.concat([tf.reshape(g, [-1]) for g in gradients2], axis=0)

    self.ph_tangent = ph_tangent
    return fvp

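# Why a Fisher-vector product suffices (a NumPy sketch under the assumption
# that the caller runs conjugate gradient, as TRPO does): CG solves F x = g
# using only products F v, never materializing F. An explicit matrix stands
# in for the fvp op above.
import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()                  # residual of F x = g at x = 0
    p = r.copy()
    rr = r @ r
    for _ in range(iters):
        Fp = fvp(p)
        alpha = rr / (p @ Fp)
        x += alpha * p
        r -= alpha * Fp
        rr_new = r @ r
        if rr_new < tol:
            break
        p = r + (rr_new / rr) * p
        rr = rr_new
    return x

F = np.array([[2., 0.], [0., 4.]])   # stand-in for the Fisher matrix
g = np.array([1., 1.])
assert np.allclose(F @ conjugate_gradient(lambda v: F @ v, g), g)
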
def build_graph(self, input):
    if hasattr(input, 'shape'):
        input_shape = input.shape
    else:
        input_shape = input.image
    if np.prod(input_shape) == 0:
        input_shape = [1]
    shape = [None] + input_shape + [input.history]
    self.ph_state = graph.Placeholder(np.float32, shape=shape)

    if len(shape) <= 4:
        state_input = self.ph_state.checked
    else:
        # move channels after history
        perm = list(range(len(shape)))
        perm = perm[0:3] + perm[-1:] + perm[3:-1]
        transpose = tf.transpose(self.ph_state.checked, perm=perm)
        # mix history and channels in one dimension
        state_input = tf.reshape(transpose,
                                 [-1] + shape[1:3] + [np.prod(shape[3:])])
    return state_input

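# Worked example of the permutation above (a sketch with assumed shapes):
# for shape = [None, 84, 84, 3, 4] (height, width, channels, history),
# perm becomes [0, 1, 2, 4, 3], moving channels after history, and the
# reshape folds the trailing history * channels axes into one: [-1, 84, 84, 12].
shape = [None, 84, 84, 3, 4]
perm = list(range(len(shape)))
perm = perm[0:3] + perm[-1:] + perm[3:-1]
assert perm == [0, 1, 2, 4, 3]
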
def build_graph(self, sg_value_net):
    # 'Observed' value of a state = discounted reward
    vf_scale = dppo_config.config.critic_scale

    ph_ytarg_ny = graph.Placeholder(np.float32)
    v1_loss = graph.TfNode(tf.square(sg_value_net.head.node - ph_ytarg_ny.node))

    if dppo_config.config.vf_clipped_loss:
        ph_old_vpred = graph.Placeholder(np.float32)
        clip_e = dppo_config.config.clip_e
        vpredclipped = ph_old_vpred.node + \
            tf.clip_by_value(sg_value_net.head.node - ph_old_vpred.node,
                             -clip_e, clip_e)
        v2_loss = graph.TfNode(tf.square(vpredclipped - ph_ytarg_ny.node))
        vf_mse = graph.TfNode(vf_scale *
                              tf.reduce_mean(tf.maximum(v2_loss.node, v1_loss.node)))
    else:
        vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(v1_loss.node))

    if dppo_config.config.l2_coeff is not None:
        l2 = graph.TfNode(dppo_config.config.l2_coeff *
                          tf.add_n([tf.reduce_sum(tf.square(v))
                                    for v in utils.Utils.flatten(sg_value_net.weights.node)]))
        sg_vf_total_loss = graph.TfNode(l2.node + vf_mse.node)
    else:
        sg_vf_total_loss = vf_mse

    sg_gradients = optimizer.Gradients(sg_value_net.weights, loss=sg_vf_total_loss,
                                       norm=dppo_config.config.gradients_norm_clipping)
    sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

    # Op to compute value of a state
    if dppo_config.config.use_lstm:
        self.op_value = self.Ops(sg_value_net.head, sg_value_net.lstm_state,
                                 state=sg_value_net.ph_state,
                                 lstm_state=sg_value_net.ph_lstm_state)
        self.op_lstm_reset_timestep = self.Op(sg_value_net.lstm_reset_timestep)
    else:
        self.op_value = self.Op(sg_value_net.head, state=sg_value_net.ph_state)

    self.op_get_weights = self.Op(sg_value_net.weights)
    self.op_assign_weights = self.Op(sg_value_net.weights.assign,
                                     weights=sg_value_net.weights.ph_weights)

    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_value_net.weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_value_net.weights)
    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(sg_set_weights_flatten,
                                          value=sg_set_weights_flatten.ph_value)

    feeds = dict(state=sg_value_net.ph_state, ytarg_ny=ph_ytarg_ny)
    if dppo_config.config.use_lstm:
        feeds.update(dict(lstm_state=sg_value_net.ph_lstm_state))
    if dppo_config.config.vf_clipped_loss:
        feeds.update(dict(vpred_old=ph_old_vpred))

    if dppo_config.config.use_lstm:
        self.op_compute_gradients = self.Ops(sg_gradients.calculate,
                                             sg_value_net.lstm_state, **feeds)
    else:
        self.op_compute_gradients = self.Op(sg_gradients.calculate, **feeds)

    self.op_compute_loss_and_gradient_flatten = self.Ops(sg_vf_total_loss,
                                                         sg_gradients_flatten,
                                                         **feeds)

    losses = [sg_vf_total_loss, vf_mse]
    if dppo_config.config.l2_coeff is not None:
        losses.append(l2)
    self.op_losses = self.Ops(*losses, **feeds)

    # Init Op for all weights
    sg_initialize = graph.Initialize()
    self.op_initialize = self.Op(sg_initialize)

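# NumPy check of the clipped value loss above: the clipped prediction cannot
# move more than clip_e away from the old prediction, and the loss takes the
# elementwise maximum of the two squared errors (the pessimistic choice).
import numpy as np

clip_e = 0.2
vpred, vpred_old, ytarg = 1.0, 0.5, 0.0
vpredclipped = vpred_old + np.clip(vpred - vpred_old, -clip_e, clip_e)  # 0.7
v1 = (vpred - ytarg) ** 2         # 1.0
v2 = (vpredclipped - ytarg) ** 2  # 0.49
assert np.isclose(max(v1, v2), 1.0)
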
def build_graph(self):
    # Build graph
    sg_actor_network = ActorNetwork()
    sg_critic_network = CriticNetwork()
    sg_actor_target_network = ActorNetwork()
    sg_critic_target_network = CriticNetwork()

    ph_action_gradient = graph.Placeholder(np.float32,
                                           (None, cfg.config.output.action_size))

    actor_grad_args = dict(loss=sg_actor_network.actor,
                           grad_ys=-ph_action_gradient.node)
    if cfg.config.no_ps:
        sg_actor_optimizer = optimizer.AdamOptimizer(cfg.config.actor_learning_rate)
        actor_grad_args.update(dict(optimizer=sg_actor_optimizer))
    sg_actor_gradients = optimizer.Gradients(sg_actor_network.weights,
                                             **actor_grad_args)

    sg_critic_loss = loss.DDPGLoss(sg_critic_network, cfg.config)
    critic_grad_args = dict(loss=sg_critic_loss)
    if cfg.config.no_ps:
        sg_critic_optimizer = optimizer.AdamOptimizer(cfg.config.critic_learning_rate)
        critic_grad_args.update(dict(optimizer=sg_critic_optimizer))
    sg_critic_gradients = optimizer.Gradients(sg_critic_network.weights,
                                              **critic_grad_args)

    sg_critic_action_gradients = optimizer.Gradients(sg_critic_network.ph_action,
                                                     loss=sg_critic_network.critic)

    # Expose public API
    self.op_assign_actor_weights = self.Op(sg_actor_network.weights.assign,
                                           weights=sg_actor_network.weights.ph_weights)
    self.op_assign_critic_weights = self.Op(sg_critic_network.weights.assign,
                                            weights=sg_critic_network.weights.ph_weights)
    self.op_assign_actor_target_weights = self.Op(sg_actor_target_network.weights.assign,
                                                  weights=sg_actor_target_network.weights.ph_weights)
    self.op_assign_critic_target_weights = self.Op(sg_critic_target_network.weights.assign,
                                                   weights=sg_critic_target_network.weights.ph_weights)

    self.op_get_action = self.Op(sg_actor_network.actor,
                                 state=sg_actor_network.ph_state)
    self.op_get_critic_q = self.Op(sg_critic_network.critic,
                                   state=sg_critic_network.ph_state,
                                   action=sg_critic_network.ph_action)
    self.op_get_actor_target = self.Op(sg_actor_target_network.actor,
                                       state=sg_actor_target_network.ph_state)
    self.op_get_critic_target = self.Op(sg_critic_target_network.critic,
                                        state=sg_critic_target_network.ph_state,
                                        action=sg_critic_target_network.ph_action)

    self.op_compute_actor_gradients = self.Op(sg_actor_gradients.calculate,
                                              state=sg_actor_network.ph_state,
                                              grad_ys=ph_action_gradient)
    self.op_compute_critic_gradients = self.Op(sg_critic_gradients.calculate,
                                               state=sg_critic_network.ph_state,
                                               action=sg_critic_network.ph_action,
                                               predicted=sg_critic_loss.ph_predicted)
    self.op_compute_critic_action_gradients = self.Op(sg_critic_action_gradients.calculate,
                                                      state=sg_critic_network.ph_state,
                                                      action=sg_critic_network.ph_action)

    # integrated with grad computation by log_lvl
    self.op_critic_loss = self.Op(sg_critic_loss,
                                  state=sg_critic_network.ph_state,
                                  action=sg_critic_network.ph_action,
                                  predicted=sg_critic_loss.ph_predicted)
    self.op_compute_norm_actor_gradients = self.Op(sg_actor_gradients.global_norm,
                                                   state=sg_actor_network.ph_state,
                                                   grad_ys=ph_action_gradient)
    self.op_compute_norm_critic_gradients = self.Op(sg_critic_gradients.global_norm,
                                                    state=sg_critic_network.ph_state,
                                                    action=sg_critic_network.ph_action,
                                                    predicted=sg_critic_loss.ph_predicted)
    self.op_compute_norm_critic_action_gradients = self.Op(sg_critic_action_gradients.global_norm,
                                                           state=sg_critic_network.ph_state,
                                                           action=sg_critic_network.ph_action)

    if cfg.config.no_ps:
        sg_actor_weights = sg_actor_network.weights
        sg_critic_weights = sg_critic_network.weights
        sg_actor_target_weights = sg_actor_target_network.weights
        sg_critic_target_weights = sg_critic_target_network.weights

        # needs to reassign weights from actor & critic to the target networks
        sg_init_actor_target_weights = \
            graph.AssignWeights(sg_actor_target_weights, sg_actor_weights).op
        sg_init_critic_target_weights = \
            graph.AssignWeights(sg_critic_target_weights, sg_critic_weights).op
        sg_update_actor_target_weights = \
            graph.AssignWeights(sg_actor_target_weights, sg_actor_weights,
                                cfg.config.tau).op
        sg_update_critic_target_weights = \
            graph.AssignWeights(sg_critic_target_weights, sg_critic_weights,
                                cfg.config.tau).op

        self.op_get_weights = self.Ops(sg_actor_weights, sg_actor_target_weights,
                                       sg_critic_weights, sg_critic_target_weights)
        self.op_init_target_weights = self.Ops(sg_init_actor_target_weights,
                                               sg_init_critic_target_weights)
        self.op_update_target_weights = self.Ops(sg_update_actor_target_weights,
                                                 sg_update_critic_target_weights)

        self.op_apply_actor_gradients = self.Ops(sg_actor_gradients.apply,
                                                 gradients=sg_actor_gradients.ph_gradients)
        self.op_apply_critic_gradients = self.Op(sg_critic_gradients.apply,
                                                 gradients=sg_critic_gradients.ph_gradients)

    sg_initialize = graph.Initialize()
    self.op_initialize = self.Op(sg_initialize)

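# Minimal sketch of the soft target update that graph.AssignWeights(..., tau)
# is used for here (assuming the standard DDPG rule; the class itself is
# defined elsewhere): target <- tau * source + (1 - tau) * target.
import numpy as np

def soft_update(target, source, tau):
    return tau * source + (1. - tau) * target

target_w = soft_update(np.zeros(3), np.ones(3), tau=0.001)
assert np.allclose(target_w, 0.001)
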