def build_graph(self):
    """Create separate optimizers for actor and critic weights and expose ops.

    A global step counter and a ``Network`` are built first; the network's
    ``actor`` and ``critic`` parts are kept on ``self``.  Each weight set gets
    its own optimizer: Adam when ``da3c_config.config.optimizer == 'Adam'``,
    otherwise RMSProp fed by ``da3c_graph.LearningRate``.  When
    ``da3c_config.config.use_icm`` is set, ICM weights with their own Adam
    optimizer are exposed as well.
    """
    settings = da3c_config.config

    # Build graph
    sg_global_step = graph.GlobalStep()
    sg_network = Network()
    self.actor = sg_network.actor
    self.critic = sg_network.critic

    if settings.optimizer == 'Adam':
        sg_actor_optimizer = optimizer.AdamOptimizer(
            settings.initial_learning_rate)
        sg_critic_optimizer = optimizer.AdamOptimizer(
            settings.initial_learning_rate)
    else:
        sg_learning_rate = da3c_graph.LearningRate(
            sg_global_step, settings.initial_learning_rate)
        # Both heads use the same RMSProp hyper-parameters.
        rmsprop_kwargs = dict(
            learning_rate=sg_learning_rate,
            decay=settings.RMSProp.decay,
            momentum=0.0,
            epsilon=settings.RMSProp.epsilon)
        sg_actor_optimizer = optimizer.RMSPropOptimizer(**rmsprop_kwargs)
        sg_critic_optimizer = optimizer.RMSPropOptimizer(**rmsprop_kwargs)

    sg_actor_gradients = optimizer.Gradients(
        self.actor.weights, optimizer=sg_actor_optimizer)
    sg_critic_gradients = optimizer.Gradients(
        self.critic.weights, optimizer=sg_critic_optimizer)

    if settings.use_icm:
        sg_icm_optimizer = optimizer.AdamOptimizer(settings.icm.lr)
        sg_icm_weights = icm_model.ICM().weights
        sg_icm_gradients = optimizer.Gradients(
            sg_icm_weights, optimizer=sg_icm_optimizer)

        # Expose ICM public API
        self.op_icm_get_weights = self.Op(sg_icm_weights)
        self.op_icm_apply_gradients = self.Op(
            sg_icm_gradients.apply,
            gradients=sg_icm_gradients.ph_gradients)

    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_check_weights = self.Ops(
        self.actor.weights.check, self.critic.weights.check)
    self.op_get_weights = self.Ops(self.actor.weights, self.critic.weights)
    self.op_apply_gradients = self.Ops(
        sg_actor_gradients.apply,
        sg_critic_gradients.apply,
        sg_global_step.increment,
        gradients=(sg_actor_gradients.ph_gradients,
                   sg_critic_gradients.ph_gradients),
        increment=sg_global_step.ph_increment)
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build the DA3C loss, gradients and summaries for one network.

    Exposes ops to assign weights, query action/value (with the LSTM state
    when ``da3c_config.config.use_lstm`` is set) and compute gradients
    together with merged scalar summaries.  With ``use_icm`` enabled, ICM
    ops (assign weights, intrinsic reward, ICM gradients) are exposed too.
    """
    # Build graph
    sg_network = Network()

    sg_loss = loss.DA3CLoss(
        sg_network.actor, sg_network.critic, da3c_config.config)
    sg_gradients = optimizer.Gradients(
        sg_network.weights,
        loss=sg_loss,
        norm=da3c_config.config.gradients_norm_clipping)

    if da3c_config.config.use_icm:
        sg_icm_network = icm_model.ICM()
        sg_icm_gradients = optimizer.Gradients(
            sg_icm_network.weights, loss=sg_icm_network.loss)

        # Expose ICM public API
        self.op_icm_assign_weights = self.Op(
            sg_icm_network.weights.assign,
            weights=sg_icm_network.weights.ph_weights)

        feeds = dict(state=sg_icm_network.ph_state,
                     probs=sg_icm_network.ph_probs)
        self.op_get_intrinsic_reward = self.Ops(
            sg_icm_network.rew_out, **feeds)

        # ICM gradients additionally need the taken action.
        feeds.update(dict(action=sg_icm_network.ph_taken))
        self.op_compute_icm_gradients = self.Op(
            sg_icm_gradients.calculate, **feeds)

    # Loss terms are divided by the first dimension of the state input.
    batch_size = tf.to_float(tf.shape(sg_network.ph_state.node)[0])

    scalar_summaries = [
        tf.summary.scalar('policy_loss', sg_loss.policy_loss / batch_size),
        tf.summary.scalar('value_loss', sg_loss.value_loss / batch_size),
        tf.summary.scalar('entropy', sg_loss.entropy / batch_size),
        tf.summary.scalar('gradients_global_norm',
                          sg_gradients.global_norm),
        tf.summary.scalar('weights_global_norm',
                          sg_network.weights.global_norm),
    ]
    summaries = tf.summary.merge(scalar_summaries)

    # Expose public API
    self.op_assign_weights = self.Op(
        sg_network.weights.assign,
        weights=sg_network.weights.ph_weights)

    feeds = dict(state=sg_network.ph_state,
                 action=sg_loss.ph_action,
                 advantage=sg_loss.ph_advantage,
                 discounted_reward=sg_loss.ph_discounted_reward)

    if da3c_config.config.use_lstm:
        feeds.update(dict(lstm_state=sg_network.ph_lstm_state))
        self.lstm_zero_state = sg_network.lstm_zero_state
        self.op_get_action_value_and_lstm_state = self.Ops(
            sg_network.actor,
            sg_network.critic,
            sg_network.lstm_state,
            state=sg_network.ph_state,
            lstm_state=sg_network.ph_lstm_state)
    else:
        self.op_get_action_and_value = self.Ops(
            sg_network.actor,
            sg_network.critic,
            state=sg_network.ph_state)

    self.op_compute_gradients_and_summaries = self.Ops(
        sg_gradients.calculate, summaries, **feeds)
def build_graph(self):
    """Attach an RMSProp optimizer to the ``_ManagerNetwork`` weights.

    Exposes ops to read the global step, fetch the weights, apply gradients
    (also incrementing the global step) and initialize the graph.
    """
    sg_weights = _ManagerNetwork().weights
    sg_global_step = graph.GlobalStep()

    sg_learning_rate = fun_graph.LearningRate(sg_global_step)
    sg_optimizer = optimizer.RMSPropOptimizer(
        learning_rate=sg_learning_rate,
        decay=cfg.RMSProp.decay,
        momentum=0.0,
        epsilon=cfg.RMSProp.epsilon)
    sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)

    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_get_weights = self.Op(sg_weights)
    self.op_apply_gradients = self.Ops(
        sg_gradients.apply,
        sg_global_step.increment,
        gradients=sg_gradients.ph_gradients,
        increment=sg_global_step.ph_increment)
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build the TRPO surrogate, KL, Fisher-vector product and PPO-clip ops.

    Exposes ops for flat weight get/set, the flattened surrogate gradient,
    the (surrogate, KL, entropy) losses, the Fisher-vector product, and an
    Adam minimization step of the PPO clipped surrogate loss.
    """
    sg_network = Network()
    sg_get_weights_flatten = GetVariablesFlatten(sg_network.weights)
    sg_set_weights_flatten = SetVariablesFlatten(sg_network.weights)

    ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

    sg_probtype = ProbType(trpo_config.config.output.action_size)
    ph_oldprob_np = sg_probtype.ProbVariable()

    sg_logp_n = sg_probtype.Loglikelihood(sg_network.actor)
    sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

    # Surrogate loss: negated mean of likelihood ratio times advantage.
    likelihood_ratio = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
    sg_surr = graph.TfNode(
        -tf.reduce_mean(likelihood_ratio * ph_adv_n.node))

    # KL between a gradient-stopped copy of the actor output and the actor
    # output itself, summed and divided by the first state dimension.
    frozen_actor = graph.TfNode(tf.stop_gradient(sg_network.actor.node))
    sg_sum = tf.reduce_sum(
        sg_probtype.Kl(frozen_actor, sg_network.actor).node)
    sg_factor = tf.cast(tf.shape(sg_network.ph_state.node)[0], tf.float32)
    sg_kl_first_fixed = graph.TfNode(sg_sum / sg_factor)

    sg_kl = graph.TfNode(tf.reduce_mean(
        sg_probtype.Kl(ph_oldprob_np, sg_network.actor).node))

    sg_fvp = FisherVectorProduct(sg_kl_first_fixed, sg_network.weights)

    sg_ent = graph.TfNode(tf.reduce_mean(
        sg_probtype.Entropy(sg_network.actor).node))

    sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_surr)
    sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

    # The two feed sets differ only in the keyword used for the old
    # probabilities ('oldprob_np' vs 'prob_variable').
    gradient_feeds = dict(
        state=sg_network.ph_state,
        sampled_variable=sg_probtype.ph_sampled_variable,
        adv_n=ph_adv_n,
        oldprob_np=ph_oldprob_np)
    loss_feeds = dict(
        state=sg_network.ph_state,
        sampled_variable=sg_probtype.ph_sampled_variable,
        adv_n=ph_adv_n,
        prob_variable=ph_oldprob_np)

    self.op_get_weights = self.Op(sg_network.weights)
    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    self.op_compute_gradient = self.Op(
        sg_gradients_flatten, **gradient_feeds)

    self.op_losses = self.Ops(sg_surr, sg_kl, sg_ent, **loss_feeds)

    self.op_fisher_vector_product = self.Op(
        sg_fvp, tangent=sg_fvp.ph_tangent, **loss_feeds)

    # PPO clipped surrogate loss
    # likelihood ratio of old and new policy
    r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
    surr = r_theta * ph_adv_n.node
    clip_e = trpo_config.config.PPO.clip_e
    clipped_ratio = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e)
    surr_clipped = clipped_ratio * ph_adv_n.node
    sg_ppo_loss = graph.TfNode(
        -tf.reduce_mean(tf.minimum(surr, surr_clipped)))

    ppo_optimizer = tf.train.AdamOptimizer(
        learning_rate=trpo_config.config.PPO.learning_rate)
    sg_minimize = graph.TfNode(ppo_optimizer.minimize(sg_ppo_loss.node))

    self.op_ppo_optimize = self.Op(sg_minimize, **gradient_feeds)
def build_graph(self):
    """Build the optimizer, step counter and score average for ``Network``.

    The learning rate is ``lr_schedule.Linear`` when
    ``config.use_linear_schedule`` is set, otherwise the constant
    ``config.initial_learning_rate``.  ``config.optimizer`` selects the
    optimizer and must be ``'Adam'`` or ``'RMSProp'``.

    Raises:
        AssertionError: if ``config.optimizer`` is neither ``'Adam'`` nor
            ``'RMSProp'``.
    """
    sg_global_step = graph.GlobalStep()
    sg_network = Network()
    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_network.weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_network.weights)

    if config.use_linear_schedule:
        sg_learning_rate = lr_schedule.Linear(sg_global_step, config)
    else:
        sg_learning_rate = config.initial_learning_rate

    if config.optimizer == 'Adam':
        sg_optimizer = optimizer.AdamOptimizer(sg_learning_rate)
    elif config.optimizer == 'RMSProp':
        sg_optimizer = optimizer.RMSPropOptimizer(
            learning_rate=sg_learning_rate,
            decay=config.RMSProp.decay,
            epsilon=config.RMSProp.epsilon)
    else:
        # Raised explicitly rather than via `assert False`: asserts are
        # stripped under `python -O`, which would let execution fall through
        # to an unbound `sg_optimizer`.  The exception type is kept.
        raise AssertionError(
            'There are 2 valid options for optimizers: Adam | RMSProp '
            '(got: {!r})'.format(config.optimizer))

    sg_gradients_apply = optimizer.Gradients(
        sg_network.weights, optimizer=sg_optimizer)

    sg_average_reward = graph.LinearMovingAverage(
        config.avg_in_num_batches)
    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_score = self.Op(sg_average_reward.average)

    self.op_get_weights_signed = self.Ops(
        sg_network.weights, sg_global_step.n)
    self.op_assign_weights = self.Op(
        sg_network.weights.assign,
        weights=sg_network.weights.ph_weights)

    self.op_apply_gradients = self.Ops(
        sg_gradients_apply.apply,
        sg_global_step.increment,
        gradients=sg_gradients_apply.ph_gradients,
        increment=sg_global_step.ph_increment)
    self.op_add_rewards_to_model_score_routine = self.Ops(
        sg_average_reward.add,
        reward_sum=sg_average_reward.ph_sum,
        reward_weight=sg_average_reward.ph_count)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    # Gradient combining routines
    self.op_submit_gradients = self.Call(
        graph.get_gradients_apply_routine(config))

    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build the A3C loss and gradients for ``_WorkerNetwork`` and expose ops.

    The network is kept on ``self.sg_network``.  Exposed ops cover weight
    assignment, gradient computation, LSTM-state reset/assign, and several
    inference queries (perception, action/value, action only, value with
    perception).  ``op_get_lstm_state`` is the raw ``lstm_state_out.node``
    rather than an ``Op`` wrapper.
    """
    net = _WorkerNetwork()
    self.sg_network = net

    sg_loss = fun_graph.A3CLoss(net.pi, net.vi, entropy=False)
    sg_gradients = optimizer.Gradients(net.weights, loss=sg_loss)

    # Expose public API
    self.op_assign_weights = self.Op(
        net.weights.assign, weights=net.weights.ph_weights)

    self.op_compute_gradients = self.Op(
        sg_gradients.calculate,
        ph_state=net.ph_state,
        ph_goal=net.ph_goal,
        ph_action=sg_loss.ph_action,
        ph_value=sg_loss.ph_value,
        ph_discounted_reward=sg_loss.ph_discounted_reward,
        ph_initial_lstm_state=net.ph_initial_lstm_state,
        ph_step_size=net.ph_step_size)

    self.op_reset_lstm_state = self.Op(net.lstm_state_out.assign_from_value)
    self.op_assign_lstm_state = self.Op(
        net.lstm_state_out.assign_from_ph,
        ph_variable=net.lstm_state)
    self.op_get_lstm_state = net.lstm_state_out.node

    # without lstm state freezes
    self.op_get_zt = self.Op(net.perception, ph_state=net.ph_state)

    # Feeds shared by the two policy queries below.
    policy_feeds = dict(
        ph_state=net.ph_state,
        ph_goal=net.ph_goal,
        ph_initial_lstm_state=net.ph_initial_lstm_state,
        ph_step_size=net.ph_step_size)

    self.op_get_action_and_value = self.Ops(
        net.pi, net.vi, net.lstm_state, **policy_feeds)

    # use for exploitation
    self.op_get_action = self.Ops(net.pi, net.lstm_state, **policy_feeds)

    # with lstm state freezes
    self.op_get_value_zt = self.Ops(
        net.perception,
        net.vi,
        net.lstm_state,
        ph_state=net.ph_state,
        ph_initial_lstm_state=net.ph_initial_lstm_state,
        ph_step_size=net.ph_step_size)
def build_graph(self):
    """Build the cosine loss and gradients for ``_ManagerNetwork``.

    The network is kept on ``self.sg_network``.  Exposed ops cover weight
    assignment, gradient computation, LSTM-state reset/assign, and queries
    for goal, value and ``Mspace`` outputs.  ``op_get_lstm_state`` is the
    raw ``lstm_state_out.node`` rather than an ``Op`` wrapper.
    """
    net = _ManagerNetwork()
    self.sg_network = net

    sg_loss = fun_graph.CosineLoss(net.goal, net.value)
    sg_gradients = optimizer.Gradients(net.weights, loss=sg_loss)

    # Expose public API
    self.op_assign_weights = self.Op(
        net.weights.assign, ph_weights=net.weights.ph_weights)

    self.op_compute_gradients = self.Op(
        sg_gradients.calculate,
        ph_perception=net.ph_perception,
        ph_stc_diff_st=sg_loss.ph_stc_diff_st,
        ph_discounted_reward=sg_loss.ph_discounted_reward,
        ph_initial_lstm_state=net.ph_initial_lstm_state,
        ph_step_size=net.ph_step_size)

    self.op_reset_lstm_state = self.Op(net.lstm_state_out.assign_from_value)
    self.op_assign_lstm_state = self.Op(
        net.lstm_state_out.assign_from_ph,
        ph_variable=net.lstm_state)
    self.op_get_lstm_state = net.lstm_state_out.node

    # Feeds shared by every query that also returns the LSTM state.
    recurrent_feeds = dict(
        ph_perception=net.ph_perception,
        ph_initial_lstm_state=net.ph_initial_lstm_state,
        ph_step_size=net.ph_step_size)

    # without lstm state freezes
    self.op_get_goal_value_st = self.Ops(
        net.goal, net.value, net.Mspace, net.lstm_state,
        **recurrent_feeds)

    self.op_get_st = self.Op(net.Mspace, ph_perception=net.ph_perception)

    # with lstm state freezes
    self.op_get_goal_st = self.Ops(
        net.goal, net.Mspace, net.lstm_state, **recurrent_feeds)

    self.op_get_value = self.Ops(
        net.value, net.lstm_state, **recurrent_feeds)
def build_graph(self, weights):
    """Attach an Adam optimizer and step counters to ``weights``.

    Two counters are created: ``sg_global_step`` (exposed via
    ``op_n_step``) and ``sg_update_step`` (exposed via ``op_upd_step`` and
    incremented together with every gradient application).  With
    ``dppo_config.config.use_linear_schedule`` the learning rate is a
    ``lr_schedule.Linear`` driven by the counter that
    ``dppo_config.config.schedule_step`` names (``'update'`` or
    ``'environment'``); otherwise it is the constant
    ``initial_learning_rate``.

    Args:
        weights: weights sub-graph the optimizer and the flat get/set
            helpers are attached to.

    Raises:
        AssertionError: if a linear schedule is requested and
            ``schedule_step`` is neither ``'update'`` nor ``'environment'``.
    """
    # Build graph
    sg_global_step = graph.GlobalStep()
    sg_update_step = graph.GlobalStep()
    sg_weights = weights

    if dppo_config.config.use_linear_schedule:
        if dppo_config.config.schedule_step == 'update':
            sg_schedule_step = sg_update_step
        elif dppo_config.config.schedule_step == 'environment':
            sg_schedule_step = sg_global_step
        else:
            # The original message called .format() on a string without a
            # placeholder, so the offending value was never shown.  Raised
            # explicitly (same exception type) so the check survives
            # `python -O`.
            raise AssertionError(
                'Valid options for the schedule step are: update OR '
                'environment. You provided the following option: '
                '{!r}'.format(dppo_config.config.schedule_step))
        sg_learning_rate = lr_schedule.Linear(
            sg_schedule_step, dppo_config.config)
    else:
        sg_learning_rate = dppo_config.config.initial_learning_rate

    sg_optimizer = optimizer.AdamOptimizer(
        sg_learning_rate, epsilon=dppo_config.config.optimizer.epsilon)
    sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)

    sg_average_reward = graph.LinearMovingAverage(
        dppo_config.config.avg_in_num_batches)
    sg_initialize = graph.Initialize()

    # Weights get/set for updating the policy
    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_weights)

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_upd_step = self.Op(sg_update_step.n)
    self.op_score = self.Op(sg_average_reward.average)

    self.op_inc_global_step = self.Ops(
        sg_global_step.increment,
        increment=sg_global_step.ph_increment)
    self.op_inc_global_step_and_average_reward = self.Ops(
        sg_global_step.increment,
        sg_average_reward.add,
        increment=sg_global_step.ph_increment,
        reward_sum=sg_average_reward.ph_sum,
        reward_weight=sg_average_reward.ph_count)

    self.op_get_weights = self.Op(sg_weights)
    self.op_get_weights_signed = self.Ops(sg_weights, sg_update_step.n)

    self.op_apply_gradients = self.Ops(
        sg_gradients.apply,
        sg_update_step.increment,
        gradients=sg_gradients.ph_gradients,
        increment=sg_update_step.ph_increment)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    # Gradient combining routines
    self.op_submit_gradients = self.Call(
        graph.get_gradients_apply_routine(dppo_config.config))

    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build a value network over the configured input plus a timestep.

    The flattened input is concatenated with a ``[None, 1]`` step
    placeholder, passed through dense layers sized by
    ``trpo_config.config.hidden_sizes`` and reduced to one output.  The
    loss is the mean squared error against ``ph_ytarg_ny`` plus an L2
    penalty (factor ``1e-3``) over all weights.
    """
    sg_input = layer.ConfiguredInput(trpo_config.config.input)

    # add one extra feature for timestep
    ph_step = graph.Placeholder(np.float32, shape=[None, 1])
    state = (sg_input.ph_state, ph_step)

    concatenated = graph.Concat([layer.Flatten(sg_input), ph_step], axis=1)

    activation = layer.Activation.get_activation(
        trpo_config.config.activation)
    hidden_descs = [
        dict(type=layer.Dense, size=size, activation=activation)
        for size in trpo_config.config.hidden_sizes
    ]
    head = layer.GenericLayers(concatenated, hidden_descs)
    value = layer.Dense(head, 1)

    ph_ytarg_ny = graph.Placeholder(np.float32)
    squared_error = tf.square(ph_ytarg_ny.node - value.node)
    mse = graph.TfNode(tf.reduce_mean(squared_error))

    weights = layer.Weights(sg_input, head, value)

    sg_get_weights_flatten = graph.GetVariablesFlatten(weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(weights)

    squared_norms = [
        tf.reduce_sum(tf.square(v))
        for v in utils.Utils.flatten(weights.node)
    ]
    l2 = graph.TfNode(1e-3 * tf.add_n(squared_norms))

    total_loss = graph.TfNode(l2.node + mse.node)

    sg_gradients = optimizer.Gradients(weights, loss=total_loss)
    sg_gradients_flatten = graph.GetVariablesFlatten(
        sg_gradients.calculate)

    self.op_value = self.Op(value, state=state)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    self.op_compute_loss_and_gradient = self.Ops(
        total_loss, sg_gradients_flatten,
        state=state, ytarg_ny=ph_ytarg_ny)

    self.op_losses = self.Ops(
        total_loss, mse, l2, state=state, ytarg_ny=ph_ytarg_ny)
def build_graph(self):
    """Attach an Adam optimizer to the ``Network`` weights and expose ops.

    Exposes ops to read the global step, fetch the weights, apply gradients
    (also incrementing the global step) and initialize the graph.
    """
    # Build graph
    sg_global_step = graph.GlobalStep()
    sg_weights = Network().weights

    sg_optimizer = optimizer.AdamOptimizer(pg_config.config.learning_rate)
    sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)

    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_get_weights = self.Op(sg_weights)
    self.op_apply_gradients = self.Ops(
        sg_gradients.apply,
        sg_global_step.increment,
        gradients=sg_gradients.ph_gradients,
        increment=sg_global_step.ph_increment)
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build the policy-gradient loss and gradients for ``Network``.

    Exposes ops to assign weights, evaluate the network for a state and
    compute gradients from state, action and discounted reward.
    """
    # Build graph
    sg_network = Network()

    sg_loss = loss.PGLoss(
        action_size=pg_config.config.output.action_size,
        network=sg_network)
    sg_gradients = optimizer.Gradients(sg_network.weights, loss=sg_loss)

    # Expose public API
    self.op_assign_weights = self.Op(
        sg_network.weights.assign,
        weights=sg_network.weights.ph_weights)
    self.op_get_action = self.Op(sg_network, state=sg_network.state)
    self.op_compute_gradients = self.Op(
        sg_gradients.calculate,
        state=sg_network.state,
        action=sg_loss.ph_action,
        discounted_reward=sg_loss.ph_discounted_reward)
def build_graph(self):
    """Build online and target networks with the DQN loss and expose ops.

    Two ``Network`` instances are created (online, then target) along with
    an ``Actor``.  Exposed ops assign weights to either network, read the
    Q-values of either network, select an action, compute gradients of the
    DQN loss, copy online weights into the target network and initialize
    the graph.
    """
    sg_network = Network()
    sg_target_network = Network()

    sg_get_action = Actor()

    sg_loss = loss.DQNLoss(sg_network.output, config)
    sg_gradients_calc = optimizer.Gradients(
        sg_network.weights, loss=sg_loss)

    # Op that assigns the online weights to the target weights.
    sg_update_target_weights = graph.AssignWeights(
        sg_target_network.weights, sg_network.weights).op

    # Expose public API
    self.op_assign_weights = self.Op(
        sg_network.weights.assign,
        weights=sg_network.weights.ph_weights)
    self.op_assign_target_weights = self.Op(
        sg_target_network.weights.assign,
        target_weights=sg_target_network.weights.ph_weights)

    self.op_get_q_value = self.Op(
        sg_network.output.node, state=sg_network.ph_state)
    self.op_get_q_target_value = self.Op(
        sg_target_network.output.node,
        next_state=sg_target_network.ph_state)

    self.op_get_action = self.Op(
        sg_get_action,
        local_step=sg_get_action.ph_local_step,
        q_value=sg_get_action.ph_q_value)

    sg_initialize = graph.Initialize()

    gradient_feeds = dict(
        state=sg_network.ph_state,
        reward=sg_loss.ph_reward,
        action=sg_loss.ph_action,
        terminal=sg_loss.ph_terminal,
        q_next_target=sg_loss.ph_q_next_target,
        q_next=sg_loss.ph_q_next)

    self.op_compute_gradients = self.Op(
        sg_gradients_calc.calculate, **gradient_feeds)

    self.op_update_target_weights = self.Op(sg_update_target_weights)

    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Attach the configured optimizer to the ``Network`` weights.

    ``config.optimizer`` selects Adam or RMSProp; for RMSProp the optional
    ``decay`` and ``epsilon`` settings are forwarded only when present on
    ``config.RMSProp``.  The step increment of ``op_apply_gradients`` is
    fed through the ``n_steps`` keyword.

    Raises:
        NotImplementedError: for any other ``config.optimizer`` value.
    """
    sg_global_step = graph.GlobalStep()
    sg_network = Network()

    if config.optimizer == 'Adam':
        sg_optimizer = optimizer.AdamOptimizer(
            config.initial_learning_rate)
    elif config.optimizer == 'RMSProp':
        # Forward only the hyper-parameters the config actually defines.
        rmsprop_kwargs = {}
        if hasattr(config, 'RMSProp'):
            for option in ("decay", "epsilon"):
                if hasattr(config.RMSProp, option):
                    rmsprop_kwargs[option] = getattr(config.RMSProp, option)
        sg_optimizer = optimizer.RMSPropOptimizer(
            config.initial_learning_rate, **rmsprop_kwargs)
    else:
        raise NotImplementedError

    sg_gradients_apply = optimizer.Gradients(
        sg_network.weights, optimizer=sg_optimizer)

    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_get_weights = self.Op(sg_network.weights)
    self.op_assign_weights = self.Op(
        sg_network.weights.assign,
        weights=sg_network.weights.ph_weights)
    self.op_apply_gradients = self.Ops(
        sg_gradients_apply.apply,
        sg_global_step.increment,
        gradients=sg_gradients_apply.ph_gradients,
        n_steps=sg_global_step.ph_increment)
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build a dense value network on a flat state with one extra feature.

    The state placeholder has ``input_size + 1`` columns, where
    ``input_size`` is the single dimension of
    ``trpo_config.config.input.shape``.  The loss is the mean squared error
    against ``ph_ytarg_ny`` plus an L2 penalty (factor ``1e-3``) over all
    weights.
    """
    # The configured input shape must have exactly one dimension.
    (input_size,) = trpo_config.config.input.shape

    # add one extra feature for timestep
    ph_state = graph.Placeholder(np.float32, shape=(None, input_size + 1))

    activation = layer.Activation.get_activation(
        trpo_config.config.activation)
    descs = [
        dict(type=layer.Dense, size=size, activation=activation)
        for size in trpo_config.config.hidden_sizes
    ]
    # Final single-unit layer with no activation argument.
    descs.append(dict(type=layer.Dense, size=1))

    value = layer.GenericLayers(ph_state, descs)

    ph_ytarg_ny = graph.Placeholder(np.float32)
    squared_error = tf.square(ph_ytarg_ny.node - value.node)
    mse = graph.TfNode(tf.reduce_mean(squared_error))

    weights = layer.Weights(value)

    sg_get_weights_flatten = GetVariablesFlatten(weights)
    sg_set_weights_flatten = SetVariablesFlatten(weights)

    squared_norms = [
        tf.reduce_sum(tf.square(v))
        for v in utils.Utils.flatten(weights.node)
    ]
    l2 = graph.TfNode(1e-3 * tf.add_n(squared_norms))

    total_loss = graph.TfNode(l2.node + mse.node)

    sg_gradients = optimizer.Gradients(weights, loss=total_loss)
    sg_gradients_flatten = GetVariablesFlatten(sg_gradients.calculate)

    self.op_value = self.Op(value, state=ph_state)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    self.op_compute_loss_and_gradient = self.Ops(
        total_loss, sg_gradients_flatten,
        state=ph_state, ytarg_ny=ph_ytarg_ny)

    self.op_losses = self.Ops(
        total_loss, mse, l2, state=ph_state, ytarg_ny=ph_ytarg_ny)
def build_graph(self):
    """Attach an RMSProp optimizer to the ``_WorkerNetwork`` weights.

    Exposes ops to read the global step, fetch the weights, apply gradients
    (also incrementing the global step) and initialize the graph.
    """
    sg_weights = _WorkerNetwork().weights
    sg_global_step = graph.GlobalStep()

    sg_learning_rate = fun_graph.LearningRate(sg_global_step)
    sg_optimizer = optimizer.RMSPropOptimizer(
        learning_rate=sg_learning_rate,
        decay=cfg.RMSProp.decay,
        momentum=0.0,
        epsilon=cfg.RMSProp.epsilon)
    sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)

    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_get_weights = self.Op(sg_weights)
    self.op_apply_gradients = self.Ops(
        sg_gradients.apply,
        sg_global_step.increment,
        gradients=sg_gradients.ph_gradients,
        increment=sg_global_step.ph_increment)
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self, sg_value_net):
    """Build the value-function loss and gradients for ``sg_value_net``.

    The base loss is the squared error between the network head and the
    target ``ph_ytarg_ny``, scaled by ``critic_scale``.  With
    ``vf_clipped_loss`` the element-wise maximum of the plain and clipped
    squared errors is used instead; with ``l2_coeff`` set, an L2 penalty
    over the weights is added.

    Args:
        sg_value_net: value network sub-graph providing ``head``,
            ``weights``, ``ph_state`` and, when ``use_lstm`` is set, the
            LSTM state attributes.
    """
    settings = dppo_config.config

    vf_scale = settings.critic_scale

    # 'Observed' value of a state = discounted reward
    ph_ytarg_ny = graph.Placeholder(np.float32)
    v1_loss = graph.TfNode(
        tf.square(sg_value_net.head.node - ph_ytarg_ny.node))

    if settings.vf_clipped_loss:
        ph_old_vpred = graph.Placeholder(np.float32)
        clip_e = settings.clip_e
        # Limit how far the prediction may move from the old prediction.
        clipped_delta = tf.clip_by_value(
            sg_value_net.head.node - ph_old_vpred.node, -clip_e, clip_e)
        vpredclipped = ph_old_vpred.node + clipped_delta
        v2_loss = graph.TfNode(tf.square(vpredclipped - ph_ytarg_ny.node))
        worst_case = tf.maximum(v2_loss.node, v1_loss.node)
        vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(worst_case))
    else:
        vf_mse = graph.TfNode(vf_scale * tf.reduce_mean(v1_loss.node))

    if settings.l2_coeff is not None:
        squared_norms = [
            tf.reduce_sum(tf.square(v))
            for v in utils.Utils.flatten(sg_value_net.weights.node)
        ]
        l2 = graph.TfNode(settings.l2_coeff * tf.add_n(squared_norms))
        sg_vf_total_loss = graph.TfNode(l2.node + vf_mse.node)
    else:
        sg_vf_total_loss = vf_mse

    sg_gradients = optimizer.Gradients(
        sg_value_net.weights,
        loss=sg_vf_total_loss,
        norm=settings.gradients_norm_clipping)
    sg_gradients_flatten = graph.GetVariablesFlatten(sg_gradients.calculate)

    # Op to compute value of a state
    if settings.use_lstm:
        self.op_value = self.Ops(
            sg_value_net.head,
            sg_value_net.lstm_state,
            state=sg_value_net.ph_state,
            lstm_state=sg_value_net.ph_lstm_state)
        self.op_lstm_reset_timestep = self.Op(
            sg_value_net.lstm_reset_timestep)
    else:
        self.op_value = self.Op(
            sg_value_net.head, state=sg_value_net.ph_state)

    self.op_get_weights = self.Op(sg_value_net.weights)
    self.op_assign_weights = self.Op(
        sg_value_net.weights.assign,
        weights=sg_value_net.weights.ph_weights)

    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_value_net.weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_value_net.weights)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    feeds = dict(state=sg_value_net.ph_state, ytarg_ny=ph_ytarg_ny)
    if settings.use_lstm:
        feeds.update(dict(lstm_state=sg_value_net.ph_lstm_state))
    if settings.vf_clipped_loss:
        feeds.update(dict(vpred_old=ph_old_vpred))

    self.op_compute_gradients = self.Op(sg_gradients.calculate, **feeds)
    if settings.use_lstm:
        # The LSTM variant replaces the op above and also returns the
        # network's LSTM state.
        self.op_compute_gradients = self.Ops(
            sg_gradients.calculate, sg_value_net.lstm_state, **feeds)

    self.op_compute_loss_and_gradient_flatten = self.Ops(
        sg_vf_total_loss, sg_gradients_flatten, **feeds)

    losses = [sg_vf_total_loss, vf_mse]
    if settings.l2_coeff is not None:
        losses.append(l2)
    self.op_losses = self.Ops(*losses, **feeds)

    # Init Op for all weights
    sg_initialize = graph.Initialize()
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self, sg_network):
    """Build the PPO clipped-surrogate loss and gradients for ``sg_network``.

    The policy loss is the negated mean of the minimum of the unclipped and
    clipped surrogate terms; when ``dppo_config.config.entropy`` is set, an
    entropy term scaled by ``-entropy`` is added.

    Args:
        sg_network: policy network sub-graph providing ``head``,
            ``weights``, ``ph_state`` and, when ``use_lstm`` is set, the
            LSTM state attributes.
    """
    settings = dppo_config.config

    if settings.use_lstm:
        self.op_get_action = self.Ops(
            sg_network.head,
            sg_network.lstm_state,
            state=sg_network.ph_state,
            lstm_state=sg_network.ph_lstm_state)
        self.op_lstm_reset_timestep = self.Op(
            sg_network.lstm_reset_timestep)
    else:
        self.op_get_action = self.Op(
            sg_network.head, state=sg_network.ph_state)

    # Advantage node
    ph_adv_n = graph.TfNode(tf.placeholder(tf.float32, name='adv_n'))

    # Contains placeholder for the actual action made by the agent
    sg_probtype = ProbType(
        settings.output.action_size,
        continuous=settings.output.continuous)

    # Placeholder to store action probabilities under the old policy
    ph_oldprob_np = sg_probtype.ProbVariable()

    sg_logp_n = sg_probtype.Loglikelihood(sg_network.head)
    sg_oldlogp_n = sg_probtype.Loglikelihood(ph_oldprob_np)

    # PPO clipped surrogate loss
    # likelihood ratio of old and new policy
    r_theta = tf.exp(sg_logp_n.node - sg_oldlogp_n.node)
    surr = r_theta * ph_adv_n.node
    clip_e = settings.clip_e
    clipped_ratio = tf.clip_by_value(r_theta, 1.0 - clip_e, 1.0 + clip_e)
    surr_clipped = clipped_ratio * ph_adv_n.node
    sg_pol_clip_loss = graph.TfNode(
        -tf.reduce_mean(tf.minimum(surr, surr_clipped)))

    # PPO entropy loss
    if settings.entropy is not None:
        sg_entropy = sg_probtype.Entropy(sg_network.head)
        sg_ent_loss = (-settings.entropy) * tf.reduce_mean(sg_entropy.node)
        sg_pol_total_loss = graph.TfNode(
            sg_pol_clip_loss.node + sg_ent_loss)
    else:
        sg_pol_total_loss = sg_pol_clip_loss

    # Regular gradients
    sg_ppo_clip_gradients = optimizer.Gradients(
        sg_network.weights,
        loss=sg_pol_total_loss,
        norm=settings.gradients_norm_clipping)

    feeds = dict(
        state=sg_network.ph_state,
        action=sg_probtype.ph_sampled_variable,
        advantage=ph_adv_n,
        old_prob=ph_oldprob_np)
    if settings.use_lstm:
        feeds.update(dict(lstm_state=sg_network.ph_lstm_state))

    self.op_compute_ppo_clip_gradients = self.Op(
        sg_ppo_clip_gradients.calculate, **feeds)
    if settings.use_lstm:
        # The LSTM variant replaces the op above and also returns the
        # network's LSTM state.
        self.op_compute_ppo_clip_gradients = self.Ops(
            sg_ppo_clip_gradients.calculate,
            sg_network.lstm_state,
            **feeds)

    # Weights get/set for updating the policy
    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_network.weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_network.weights)

    self.op_get_weights = self.Op(sg_network.weights)
    self.op_assign_weights = self.Op(
        sg_network.weights.assign,
        weights=sg_network.weights.ph_weights)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    # Init Op for all weights
    sg_initialize = graph.Initialize()
    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build actor/critic weights, their target copies and optimizers.

    Creates a global step and an episode counter, four weight sets (actor,
    critic and a target copy of each), ops that copy source weights into
    the targets (plain, and with ``cfg.config.tau``), and one Adam
    optimizer per source weight set.  Only ``op_apply_actor_gradients``
    also increments the global step.
    """
    # Build graph
    sg_global_step = graph.GlobalStep()
    sg_episode_cnt = graph.GlobalStep()

    sg_actor_weights = ActorNetwork().weights
    sg_critic_weights = CriticNetwork().weights
    sg_actor_target_weights = ActorNetwork().weights
    sg_critic_target_weights = CriticNetwork().weights

    # needs reassign weights from actor & critic to target networks
    sg_init_actor_target_weights = graph.AssignWeights(
        sg_actor_target_weights, sg_actor_weights).op
    sg_init_critic_target_weights = graph.AssignWeights(
        sg_critic_target_weights, sg_critic_weights).op

    # Same assignment, parameterized by tau.
    sg_update_actor_target_weights = graph.AssignWeights(
        sg_actor_target_weights, sg_actor_weights, cfg.config.tau).op
    sg_update_critic_target_weights = graph.AssignWeights(
        sg_critic_target_weights, sg_critic_weights, cfg.config.tau).op

    sg_actor_optimizer = optimizer.AdamOptimizer(
        cfg.config.actor_learning_rate)
    sg_critic_optimizer = optimizer.AdamOptimizer(
        cfg.config.critic_learning_rate)

    sg_actor_gradients = optimizer.Gradients(
        sg_actor_weights, optimizer=sg_actor_optimizer)
    sg_critic_gradients = optimizer.Gradients(
        sg_critic_weights, optimizer=sg_critic_optimizer)

    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_get_weights = self.Ops(
        sg_actor_weights,
        sg_actor_target_weights,
        sg_critic_weights,
        sg_critic_target_weights)

    self.op_init_target_weights = self.Ops(
        sg_init_actor_target_weights, sg_init_critic_target_weights)

    self.op_update_target_weights = self.Ops(
        sg_update_actor_target_weights, sg_update_critic_target_weights)

    self.op_apply_actor_gradients = self.Ops(
        sg_actor_gradients.apply,
        sg_global_step.increment,
        gradients=sg_actor_gradients.ph_gradients,
        increment=sg_global_step.ph_increment)
    self.op_apply_critic_gradients = self.Op(
        sg_critic_gradients.apply,
        gradients=sg_critic_gradients.ph_gradients)

    self.op_n_step = self.Op(sg_global_step.n)
    self.op_inc_step = self.Op(
        sg_global_step.increment,
        increment=sg_global_step.ph_increment)

    self.op_get_episode_cnt = self.Op(sg_episode_cnt.n)
    self.op_inc_episode_cnt = self.Op(
        sg_episode_cnt.increment,
        increment=sg_episode_cnt.ph_increment)

    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Attach an optimizer and a score average to the ``Network`` weights.

    The learning rate is ``lr_schedule.Linear`` when ``use_linear_schedule``
    is set, otherwise the constant ``initial_learning_rate``.  Adam is used
    when ``optimizer == 'Adam'``; any other value selects RMSProp.  With
    ``use_icm`` enabled, ICM weights with their own Adam optimizer are
    exposed as well.
    """
    settings = da3c_config.config

    # Build graph
    sg_global_step = graph.GlobalStep()
    sg_network = Network()
    sg_weights = sg_network.weights

    if settings.use_linear_schedule:
        sg_learning_rate = lr_schedule.Linear(sg_global_step, settings)
    else:
        sg_learning_rate = settings.initial_learning_rate

    if settings.optimizer == 'Adam':
        sg_optimizer = optimizer.AdamOptimizer(sg_learning_rate)
    else:
        sg_optimizer = optimizer.RMSPropOptimizer(
            learning_rate=sg_learning_rate,
            decay=settings.RMSProp.decay,
            momentum=0.0,
            epsilon=settings.RMSProp.epsilon)

    sg_gradients = optimizer.Gradients(sg_weights, optimizer=sg_optimizer)

    if settings.use_icm:
        sg_icm_optimizer = optimizer.AdamOptimizer(settings.icm.lr)
        sg_icm_weights = icm_model.ICM().weights
        sg_icm_gradients = optimizer.Gradients(
            sg_icm_weights, optimizer=sg_icm_optimizer)

        # Expose ICM public API
        self.op_icm_get_weights = self.Op(sg_icm_weights)
        self.op_icm_apply_gradients = self.Op(
            sg_icm_gradients.apply,
            gradients=sg_icm_gradients.ph_gradients)

    sg_average_reward = graph.LinearMovingAverage(
        settings.avg_in_num_batches)
    sg_initialize = graph.Initialize()

    # Expose public API
    self.op_n_step = self.Op(sg_global_step.n)
    self.op_score = self.Op(sg_average_reward.average)

    self.op_check_weights = self.Op(sg_weights.check)
    self.op_get_weights = self.Ops(sg_weights, sg_global_step.n)

    self.op_apply_gradients = self.Ops(
        sg_gradients.apply,
        sg_global_step.increment,
        gradients=sg_gradients.ph_gradients,
        increment=sg_global_step.ph_increment)
    self.op_add_rewards_to_model_score_routine = self.Ops(
        sg_average_reward.add,
        reward_sum=sg_average_reward.ph_sum,
        reward_weight=sg_average_reward.ph_count)

    # Determine Gradients' applying methods:
    # fifo (by default), averaging, delay compensation
    sg_get_weights_flatten = graph.GetVariablesFlatten(sg_weights)
    sg_set_weights_flatten = graph.SetVariablesFlatten(sg_weights)

    self.op_get_weights_flatten = self.Op(sg_get_weights_flatten)
    self.op_set_weights_flatten = self.Op(
        sg_set_weights_flatten, value=sg_set_weights_flatten.ph_value)

    self.op_submit_gradients = self.Call(
        graph.get_gradients_apply_routine(settings))

    self.op_initialize = self.Op(sg_initialize)
def build_graph(self):
    """Build the actor/critic network, its losses and gradient subgraphs.

    Stores the actor and critic sub-networks on ``self``, creates one
    ``optimizer.Gradients`` per sub-network (policy loss for the actor,
    value loss for the critic, both clipped with
    ``da3c_config.config.gradients_norm_clipping``), optionally wires the
    ICM network, merges the scalar summaries and exposes everything as
    ``op_*`` attributes. With ``use_lstm`` set, the LSTM state of both
    sub-networks is fed and returned as an ``(actor, critic)`` pair.
    """
    config = da3c_config.config

    # Build graph
    network = Network()
    actor = network.actor
    critic = network.critic
    self.actor = actor
    self.critic = critic

    da3c_loss = loss.DA3CLoss(actor.head, critic.head, config)
    actor_gradients = optimizer.Gradients(
        actor.weights,
        loss=graph.TfNode(da3c_loss.policy_loss),
        norm=config.gradients_norm_clipping)
    critic_gradients = optimizer.Gradients(
        critic.weights,
        loss=graph.TfNode(da3c_loss.value_loss),
        norm=config.gradients_norm_clipping)

    if config.use_icm:
        icm = icm_model.ICM()
        icm_gradients = optimizer.Gradients(icm.weights, loss=icm.loss)

        # Expose ICM public API
        self.op_icm_assign_weights = self.Op(
            icm.weights.assign,
            weights=icm.weights.ph_weights)

        icm_feeds = {'state': icm.ph_state, 'probs': icm.ph_probs}
        self.op_get_intrinsic_reward = self.Ops(icm.rew_out, **icm_feeds)

        # The gradient op additionally needs the taken action.
        icm_feeds['action'] = icm.ph_taken
        self.op_compute_icm_gradients = self.Op(icm_gradients.calculate,
                                                **icm_feeds)

    summaries = tf.summary.merge([
        tf.summary.scalar('policy_loss', da3c_loss.policy_loss),
        tf.summary.scalar('value_loss', da3c_loss.value_loss),
        tf.summary.scalar('entropy', da3c_loss.entropy),
        tf.summary.scalar('actor_gradients_global_norm',
                          actor_gradients.global_norm),
        tf.summary.scalar('critic_gradients_global_norm',
                          critic_gradients.global_norm),
        tf.summary.scalar('actor_weights_global_norm',
                          actor.weights.global_norm),
        tf.summary.scalar('critic_weights_global_norm',
                          critic.weights.global_norm),
    ])

    # Expose public API
    self.op_assign_weights = self.Ops(
        actor.weights.assign,
        critic.weights.assign,
        weights=(actor.weights.ph_weights, critic.weights.ph_weights))

    feeds = {
        'state': network.ph_state,
        'action': da3c_loss.ph_action,
        'advantage': da3c_loss.ph_advantage,
        'discounted_reward': da3c_loss.ph_discounted_reward,
    }

    if config.use_lstm:
        lstm_placeholders = (actor.ph_lstm_state, critic.ph_lstm_state)
        feeds['lstm_state'] = lstm_placeholders

        self.lstm_zero_state = (actor.lstm_zero_state,
                                critic.lstm_zero_state)
        self.op_lstm_reset_timestep = self.Ops(actor.lstm_reset_timestep,
                                               critic.lstm_reset_timestep)
        self.op_get_action_value_and_lstm_state = self.Ops(
            actor.head,
            critic.head,
            (actor.lstm_state, critic.lstm_state),
            state=network.ph_state,
            lstm_state=(actor.ph_lstm_state, critic.ph_lstm_state))
    else:
        self.op_get_action_and_value = self.Ops(actor.head,
                                                critic.head,
                                                state=network.ph_state)

    self.op_compute_gradients_and_summaries = self.Ops(
        (actor_gradients.calculate, critic_gradients.calculate),
        summaries,
        **feeds)
def build_graph(self):
    """Build online and target actor/critic networks and their gradient ops.

    Creates two ``ActorNetwork`` and two ``CriticNetwork`` instances
    (online and target), gradient subgraphs for the actor weights, the
    critic weights and the critic's action input, and exposes weight
    assignment, inference, gradient and gradient-norm ops as ``op_*``
    attributes. When ``cfg.config.no_ps`` is set, Adam optimizers are
    attached to the gradients and additional ops for reading weights,
    syncing the target networks and applying gradients are exposed.
    """
    config = cfg.config

    # Build graph
    actor = ActorNetwork()
    critic = CriticNetwork()
    actor_target = ActorNetwork()
    critic_target = CriticNetwork()

    ph_action_gradient = graph.Placeholder(
        np.float32, (None, config.output.action_size))

    # The fed action gradient is negated before being used as grad_ys.
    actor_grad_args = {'loss': actor.actor,
                       'grad_ys': -ph_action_gradient.node}
    if config.no_ps:
        actor_grad_args['optimizer'] = optimizer.AdamOptimizer(
            config.actor_learning_rate)
    actor_gradients = optimizer.Gradients(actor.weights, **actor_grad_args)

    critic_loss = loss.DDPGLoss(critic, config)
    critic_grad_args = {'loss': critic_loss}
    if config.no_ps:
        critic_grad_args['optimizer'] = optimizer.AdamOptimizer(
            config.critic_learning_rate)
    critic_gradients = optimizer.Gradients(critic.weights,
                                           **critic_grad_args)

    critic_action_gradients = optimizer.Gradients(critic.ph_action,
                                                  loss=critic.critic)

    # Feed groups shared by several of the ops exposed below.
    actor_grad_feeds = {'state': actor.ph_state,
                        'grad_ys': ph_action_gradient}
    critic_feeds = {'state': critic.ph_state,
                    'action': critic.ph_action}
    critic_loss_feeds = dict(critic_feeds,
                             predicted=critic_loss.ph_predicted)

    # Expose public API
    self.op_assign_actor_weights = self.Op(
        actor.weights.assign,
        weights=actor.weights.ph_weights)
    self.op_assign_critic_weights = self.Op(
        critic.weights.assign,
        weights=critic.weights.ph_weights)
    self.op_assign_actor_target_weights = self.Op(
        actor_target.weights.assign,
        weights=actor_target.weights.ph_weights)
    self.op_assign_critic_target_weights = self.Op(
        critic_target.weights.assign,
        weights=critic_target.weights.ph_weights)

    self.op_get_action = self.Op(actor.actor, state=actor.ph_state)
    self.op_get_critic_q = self.Op(critic.critic, **critic_feeds)
    self.op_get_actor_target = self.Op(actor_target.actor,
                                       state=actor_target.ph_state)
    self.op_get_critic_target = self.Op(critic_target.critic,
                                        state=critic_target.ph_state,
                                        action=critic_target.ph_action)

    self.op_compute_actor_gradients = self.Op(
        actor_gradients.calculate, **actor_grad_feeds)
    self.op_compute_critic_gradients = self.Op(
        critic_gradients.calculate, **critic_loss_feeds)
    self.op_compute_critic_action_gradients = self.Op(
        critic_action_gradients.calculate, **critic_feeds)

    # Integrated with grad computation by log_lvl
    self.op_critic_loss = self.Op(critic_loss, **critic_loss_feeds)
    self.op_compute_norm_actor_gradients = self.Op(
        actor_gradients.global_norm, **actor_grad_feeds)
    self.op_compute_norm_critic_gradients = self.Op(
        critic_gradients.global_norm, **critic_loss_feeds)
    self.op_compute_norm_critic_action_gradients = self.Op(
        critic_action_gradients.global_norm, **critic_feeds)

    if config.no_ps:
        actor_weights = actor.weights
        critic_weights = critic.weights
        actor_target_weights = actor_target.weights
        critic_target_weights = critic_target.weights

        # needs reassign weights from actor & critic to target networks
        init_actor_target = graph.AssignWeights(
            actor_target_weights, actor_weights).op
        init_critic_target = graph.AssignWeights(
            critic_target_weights, critic_weights).op

        update_actor_target = graph.AssignWeights(
            actor_target_weights, actor_weights, config.tau).op
        update_critic_target = graph.AssignWeights(
            critic_target_weights, critic_weights, config.tau).op

        self.op_get_weights = self.Ops(actor_weights,
                                       actor_target_weights,
                                       critic_weights,
                                       critic_target_weights)

        self.op_init_target_weights = self.Ops(init_actor_target,
                                               init_critic_target)
        self.op_update_target_weights = self.Ops(update_actor_target,
                                                 update_critic_target)

        # The actor variant goes through self.Ops and the critic variant
        # through self.Op; both are kept exactly as in the original.
        self.op_apply_actor_gradients = self.Ops(
            actor_gradients.apply,
            gradients=actor_gradients.ph_gradients)
        self.op_apply_critic_gradients = self.Op(
            critic_gradients.apply,
            gradients=critic_gradients.ph_gradients)

    # NOTE(review): the flattened source does not show whether the two
    # statements below belonged to the ``no_ps`` branch above; they are kept
    # at method level so ``op_initialize`` always exists - confirm.
    initialize = graph.Initialize()
    self.op_initialize = self.Op(initialize)
def build_graph(self):
    """Create weights, target-sync ops, optimizers and counters.

    Builds two ``graph.GlobalStep`` counters (training steps and episodes),
    online and target weights for the actor and the critic, flattened
    get/set subgraphs over the online weights, target (re)assignment ops,
    one Adam optimizer per online weight set, a
    ``graph.LinearMovingAverage`` and a ``graph.Initialize`` subgraph, then
    exposes them as ``op_*`` attributes.
    """
    config = cfg.config

    # Build graph
    step_counter = graph.GlobalStep()
    episode_counter = graph.GlobalStep()

    actor_weights = ActorNetwork().weights
    critic_weights = CriticNetwork().weights

    actor_target_weights = ActorNetwork().weights
    critic_target_weights = CriticNetwork().weights

    # Each flatten subgraph gets its own Variables wrapper, as originally.
    get_flatten = graph.GetVariablesFlatten(
        graph.Variables(actor_weights, critic_weights))
    set_flatten = graph.SetVariablesFlatten(
        graph.Variables(actor_weights, critic_weights))

    # needs reassign weights from actor & critic to target networks
    init_actor_target = graph.AssignWeights(
        actor_target_weights, actor_weights).op
    init_critic_target = graph.AssignWeights(
        critic_target_weights, critic_weights).op

    update_actor_target = graph.AssignWeights(
        actor_target_weights, actor_weights, config.tau).op
    update_critic_target = graph.AssignWeights(
        critic_target_weights, critic_weights, config.tau).op

    actor_optimizer = optimizer.AdamOptimizer(config.actor_learning_rate)
    critic_optimizer = optimizer.AdamOptimizer(config.critic_learning_rate)

    actor_gradients = optimizer.Gradients(actor_weights,
                                          optimizer=actor_optimizer)
    critic_gradients = optimizer.Gradients(critic_weights,
                                           optimizer=critic_optimizer)

    average_reward = graph.LinearMovingAverage(config.avg_in_num_batches)

    # NOTE(review): construction order is kept exactly as in the original;
    # presumably Initialize has to be created after the subgraphs above -
    # confirm before moving it.
    initialize = graph.Initialize()

    # Expose public API
    self.op_get_weights_signed = self.Ops(actor_weights,
                                          actor_target_weights,
                                          critic_weights,
                                          critic_target_weights,
                                          step_counter.n)

    self.op_get_weights_flatten = self.Op(get_flatten)
    self.op_set_weights_flatten = self.Op(set_flatten,
                                          value=set_flatten.ph_value)

    self.op_init_target_weights = self.Ops(init_actor_target,
                                           init_critic_target)
    self.op_update_target_weights = self.Ops(update_actor_target,
                                             update_critic_target)

    self.op_apply_gradients = self.Ops(
        actor_gradients.apply,
        critic_gradients.apply,
        step_counter.increment,
        gradients=(actor_gradients.ph_gradients,
                   critic_gradients.ph_gradients),
        increment=step_counter.ph_increment)

    self.op_add_rewards_to_model_score_routine = self.Ops(
        average_reward.add,
        reward_sum=average_reward.ph_sum,
        reward_weight=average_reward.ph_count)
    self.op_score = self.Op(average_reward.average)

    self.op_n_step = self.Op(step_counter.n)
    self.op_inc_step = self.Op(step_counter.increment,
                               increment=step_counter.ph_increment)

    self.op_get_episode_cnt = self.Op(episode_counter.n)
    self.op_inc_episode_cnt = self.Op(
        episode_counter.increment,
        increment=episode_counter.ph_increment)

    self.op_submit_gradients = self.Call(
        graph.get_gradients_apply_routine(config))
    self.op_initialize = self.Op(initialize)