act_val_net_online = action_value_net()
act_val_online, vars_online = act_val_net_online.build_graph(
    img_state=online_img_state, n_action=FLAGS.n_action, is_training=True,
    var_scope='online_act_val')
act_val_net_target = action_value_net()
act_val_target, vars_target = act_val_net_target.build_graph(  # fixed: was calling act_val_net_online
    img_state=target_img_state,  # ASSUMED placeholder for the successor state ("need img_state_target")
    n_action=FLAGS.n_action,     # was hard-coded 21; unified with the FLAGS.n_action used below
    is_training=True, var_scope='target_act_val')

#########################################
## the best action ops in current step ##
#########################################
max_action_index_online = tf.argmax(act_val_online, axis=-1)
max_action_index_target = tf.argmax(act_val_target, axis=-1)

####################################
### hard copy ops for first init ###
####################################
update_target_ops = rl_tools.copy_a2b(vars_a=vars_online, vars_b=vars_target)

############
## q loss ##
############
max_q_val_target = tf.reduce_sum(
    act_val_target * tf.one_hot(max_action_index_target, FLAGS.n_action),
    axis=-1)  ## need img_state_target
q_val_online = tf.reduce_sum(
    act_val_online * tf.one_hot(real_action_index, FLAGS.n_action),
    axis=-1)  ## need img_state_online, real_action_index
gamma = 0.99  ## ASSUMED discount factor: the original Bellman target omitted gamma entirely
q_loss = tf.reduce_mean(tf.square(
    reward + (1. - whether_end) * gamma * max_q_val_target - q_val_online))  ## need reward, whether_end

############
## update ##
############
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimizer_for_online = tf.train.RMSPropOptimizer(learning_rate=lr)
    q_gradients_vars = optimizer_for_online.compute_gradients(q_loss, var_list=vars_online)
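# rl_tools.copy_a2b is used above but not defined in this file. A minimal
# sketch of what such a hard-copy helper presumably does (an assumption, not
# the repo's actual implementation): pair the two variable lists by position
# and group the per-variable assigns into a single op.
def _copy_a2b_sketch(vars_a, vars_b):
    # Hard copy: b_i <- a_i for every pair; the lists must be ordered
    # identically (e.g. both gathered with tf.get_collection under their
    # respective var_scope).
    assert len(vars_a) == len(vars_b)
    return tf.group(*[b.assign(a) for a, b in zip(vars_a, vars_b)])
# Running such an op once after tf.global_variables_initializer() makes the
# target network start from the same weights as the online network.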
action_online, online_actor_vars = online_actor.build_graph(  # call head reconstructed; receiving names inferred from their uses below
    img_state=online_img_state, n_action_space=1, is_training=True,
    action_range=action_range, var_scope='online_actor')
online_critic = ddpg.critic(max_abs_q_val=40)
q_online, online_critic_vars = online_critic.build_graph(
    img_state=online_img_state, action=action_online,
    is_training=True, var_scope='online_critic')

####################################
### hard copy ops for first init ###
####################################
actor_hard_copy_ops = rl_tools.copy_a2b(online_actor_vars, target_actor_vars)
critic_hard_copy_ops = rl_tools.copy_a2b(online_critic_vars, target_critic_vars)

###################
### soft update ###
###################
actor_soft_copy_ops = rl_tools.soft_copy_a2b(online_actor_vars, target_actor_vars)
critic_soft_copy_ops = rl_tools.soft_copy_a2b(online_critic_vars, target_critic_vars)

# ####################################
# ## an op for online actor update  ##
# ####################################
# take_action_ops = online_action.assign(action_online)
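# rl_tools.soft_copy_a2b is likewise not shown in this file. In DDPG the soft
# update is conventionally a Polyak average; a sketch under that assumption
# (the tau=0.001 default is the DDPG paper's value, not necessarily this repo's):
def _soft_copy_a2b_sketch(vars_a, vars_b, tau=0.001):
    # Soft update: b_i <- tau * a_i + (1 - tau) * b_i for every pair, so the
    # target networks trail the online networks slowly and stably.
    assert len(vars_a) == len(vars_b)
    return tf.group(*[b.assign(tau * a + (1. - tau) * b)
                      for a, b in zip(vars_a, vars_b)])
# Typical usage: run the hard-copy ops once at initialization so online and
# target start identical, then run the soft-copy ops after every training
# step of the online actor and critic.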