Example 1
    act_val_net_online = action_value_net()
    act_val_online, vars_online = act_val_net_online.build_graph(img_state=online_img_state, n_action=FLAGS.n_action,
                                                                  is_training=True,
                                                                  var_scope='online_act_val')

    act_val_net_target = action_value_net()
    act_val_target, vars_target = act_val_net_target.build_graph(img_state=target_img_state, n_action=FLAGS.n_action,
                                                                  is_training=True,
                                                                  var_scope='target_act_val')
    #########################################
    ## the best action ops in current step ##
    #########################################
    max_action_index_online = tf.argmax(act_val_online, axis=-1)
    max_action_index_target = tf.argmax(act_val_target, axis=-1)

    ####################################
    ### hard copy ops for first init ###
    ####################################
    update_target_ops = rl_tools.copy_a2b(vars_a=vars_online, vars_b=vars_target)

    ############
    ## q loss ##
    ############
    max_q_val_target = tf.reduce_sum(act_val_target * tf.one_hot(max_action_index_target, FLAGS.n_action), axis=-1)  ## needs target_img_state
    q_val_online = tf.reduce_sum(act_val_online * tf.one_hot(real_action_index, FLAGS.n_action), axis=-1)  ## needs online_img_state, real_action_index
    q_loss = tf.reduce_mean(tf.square(reward + (1. - whether_end) * max_q_val_target - q_val_online))  ## needs reward, whether_end

    ############
    ## update ##
    ############
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer_for_online = tf.train.RMSPropOptimizer(learning_rate=lr)
        q_gradients_vars = optimizer_for_online.compute_gradients(q_loss, var_list=vars_online)
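        # a minimal completion sketch: apply the computed gradients to the online
        # variables; the op name `train_op_online` is an assumption, not from the source
        train_op_online = optimizer_for_online.apply_gradients(q_gradients_vars)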
Example 2
    online_actor = ddpg.actor()
    action_online, online_actor_vars = online_actor.build_graph(
        img_state=online_img_state,
        n_action_space=1,
        is_training=True,
        action_range=action_range,
        var_scope='online_actor')

    online_critic = ddpg.critic(max_abs_q_val=40)
    q_online, online_critic_vars = online_critic.build_graph(
        img_state=online_img_state,
        action=action_online,
        is_training=True,
        var_scope='online_critic')

    ####################################
    ### hard copy ops for first init ###
    ####################################
    actor_hard_copy_ops = rl_tools.copy_a2b(online_actor_vars,
                                            target_actor_vars)
    critic_hard_copy_ops = rl_tools.copy_a2b(online_critic_vars,
                                             target_critic_vars)

    ###################
    ### soft update ###
    ###################
    actor_soft_copy_ops = rl_tools.soft_copy_a2b(online_actor_vars,
                                                 target_actor_vars)
    critic_soft_copy_ops = rl_tools.soft_copy_a2b(online_critic_vars,
                                                  target_critic_vars)

    # #####################################
    # ## an ops for online actor update  ##
    # #####################################
    # take_action_ops = online_action.assign(action_online)
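
Both examples rely on rl_tools.copy_a2b and rl_tools.soft_copy_a2b to synchronize the target networks with the online networks, but those helpers are not shown here. Below is a minimal sketch of how such ops are commonly built in TensorFlow 1.x; the soft-update rate tau (and its default value) is an assumption, not taken from the source.

import tensorflow as tf

def copy_a2b(vars_a, vars_b):
    # hard copy: assign each online variable (a) onto its target counterpart (b);
    # typically run once right after global variable initialization
    return tf.group(*[b.assign(a) for a, b in zip(vars_a, vars_b)])

def soft_copy_a2b(vars_a, vars_b, tau=0.01):
    # Polyak averaging: b <- tau * a + (1 - tau) * b, typically run after every training step
    return tf.group(*[b.assign(tau * a + (1. - tau) * b) for a, b in zip(vars_a, vars_b)])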