Code example #1
File: deepq.py  Project: g-and-j/baselines
    def load_act(path):
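        # Unpickle the pair (serialized model bytes, act-builder kwargs) written by the matching save routine.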
        with open(path, "rb") as f:
            model_data, act_params = cloudpickle.load(f)
        act = build_act(**act_params)
        sess = tf.Session()
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            load_state(os.path.join(td, "model"))

        return ActWrapper(act, act_params)
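
A minimal usage sketch for the loader above. The checkpoint name "saved_act.pkl", the Gym-style env, and calling load_act through ActWrapper (the indentation suggests it is a method of that class) are assumptions for illustration, not part of the original code:

# Hypothetical usage: restore a saved act function and query it for one observation.
act = ActWrapper.load_act("saved_act.pkl")  # "saved_act.pkl" is a placeholder path
obs = env.reset()                           # assumes a Gym-style env already exists
action = act(obs[None])[0]                  # the act function expects a batched observation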
Code example #2
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                pretrain_optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    pretrain_optimizer: tf.train.Optimizer
        optimizer to use for the pretraining objective that fits the Q network to
        externally precomputed Q-values (Bayes-DDPG addition).
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    train_target: (object, np.array) -> np.array
        pretraining step that fits the Q network to precomputed Q-values (Bayes-DDPG addition).
    copy_target_to_q: () -> ()
        copy the parameters from the target Q function back into the optimized Q function
        (Bayes-DDPG addition).
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        precomputed_qvals = tf.placeholder(tf.float32, [None, num_actions],
                                           name="qval")  # Bayes-DDPG addition: externally precomputed Q-values

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
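        # i.e. y_t = r_t + gamma * (1 - done_t) * Q_target(s_{t+1}, a*), where a* is the argmax
        # of the online net when double_q=True and of the target net otherwise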

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        ################ Added steps for Bayes-DDPG ############################
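        # Pretraining loss: element-wise squared error between the online Q-values and the
        # externally supplied precomputed_qvals, averaged below before minimization.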
        pretrain_target_error = tf.squared_difference(precomputed_qvals, q_t)
        pretrain_optimize_expr = pretrain_optimizer.minimize(
            tf.reduce_mean(pretrain_target_error), var_list=q_func_vars)
        ########################################################################

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        ################ Added steps for Bayes-DDPG ############################
        # copy_target_to_q_fn may be called to copy the target Q network back into the Q network
        copy_target_to_q_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            copy_target_to_q_expr.append(var.assign(var_target))
        copy_target_to_q_expr = tf.group(*copy_target_to_q_expr)
        ########################################################################

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])

        ################ Added steps for Bayes-DDPG ############################
        # pretrain target network
        train_target = U.function(inputs=[obs_t_input, precomputed_qvals],
                                  outputs=pretrain_target_error,
                                  updates=[pretrain_optimize_expr])
        ########################################################################

        update_target = U.function([], [], updates=[update_target_expr])
        copy_target_to_q = U.function([], [], updates=[copy_target_to_q_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, train_target, copy_target_to_q, {
            'q_values': q_values
        }
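
A hedged sketch of how the six values returned by build_train might be wired together. model, env, replay_buffer and the optimizer settings are illustrative placeholders, and ObservationInput is assumed to be importable from baselines.deepq.utils:

# Hypothetical training-loop fragment; every name not defined here is a placeholder.
act, train, update_target, train_target, copy_target_to_q, debug = build_train(
    make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
    q_func=model,
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    pretrain_optimizer=tf.train.AdamOptimizer(learning_rate=1e-3),
    gamma=0.99)

obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
td_errors = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
update_target()  # periodically copy the online Q network into the target network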
Code example #3
def actor_build(make_obs_ph,
                q_func,
                num_actions,
                net_list,
                scope="actor_deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    # Similar to build_train(): builds the actor's deep Q network and its helpers.
    # (Each actor runs in its own process, so variable-scope collisions are not a concern.)
    # net_list is a shared list that stores the trainer's Q-network weights as np.array values,
    # not tensors (called in_list in the older drafts commented out below).

    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)
    # Unlike build_train, the actor only needs the forward pass.
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        # Build the Q network: q_func() returns the Q-values of all actions, so q_t has shape (batch, num_actions).
        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        # q_func_vars should hold all of the Q network's variables: tf.get_collection() returns
        # the collection entries under the given scope as a list, so inter-process communication
        # only needs to transfer these variables' values.
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")
        # Note: moving the q_func_vars lookup inside update_qfunc() stops it from working; the reason is unclear.
        placeholder_list = []
        for item in net_list:
            placeholder_list.append(
                tf.placeholder(tf.float32, shape=item.shape))
        # Define the assign ops once here; creating ops inside update_qfunc() on every call would leak graph memory.
        update_qfunc_op = list()
        for var, var_target in zip(placeholder_list, q_func_vars):
            # op = tf.assign(var_target, var)
            update_qfunc_op.append(tf.assign(var_target, var))  # one assign op per variable
        # update_qfunc_op = tf.group(*update_qfunc_op)  # group
        len_qfunc = len(update_qfunc_op)

        def update_qfunc(sess, net_list_lock):
            # Must be inside tf.variable_scope(scope, reuse=reuse),
            # or use tf.get_default_session() (the session context manager cannot be used here).
            with sess.as_default():
                net_list_lock.acquire()
                # for op in update_qfunc_op:
                #     sess.run(op)
                for i in range(0, len_qfunc):
                    sess.run(update_qfunc_op[i],
                             feed_dict={placeholder_list[i]: net_list[i]})
                    # sess.run(update_qfunc_op[i],
                    #          feed_dict={placeholder_list[i]: net_list[i+net_list_index.value*len_qfunc]})
                # for next_var_target in q_func_vars:
                #     print(next_var_target.eval(session=sess))
                # print(net_list)
                net_list_lock.release()  # release the lock on the shared list

        # Below are several alternative implementations, each with some issues.
        # start of the variant that worked
        # update_qfunc_op = list()
        # for var, var_target in zip(net_list, q_func_vars):
        #     # op = tf.assign(var_target, var)
        #     update_qfunc_op.append(tf.assign(var_target, var))
        # update_qfunc_op = tf.group(*update_qfunc_op)  # group
        #
        # def update_qfunc(sess, net_list_lock):
        #     # print('update_qfunc')
        #     # Must be inside tf.variable_scope(scope, reuse=reuse),
        #     # or use tf.get_default_session() (the session context manager cannot be used here).
        #     with sess.as_default():
        #         net_list_lock.acquire()
        #         sess.run(update_qfunc_op)
        #         # for i in range(0,len(update_qfunc_op)):
        #         #     sess.run(update_qfunc_op[i])
        #         # for next_var_target in q_func_vars:
        #         #     print(next_var_target.eval(session=sess))
        #         # print(net_list)
        #         net_list_lock.release()
        #     # print('end_update_qfunc')
        # end of the variant that worked

        # update_qfunc_op = list()
        # for var, var_target in zip(net_list, q_func_vars):
        #     # op = tf.assign(var_target, var)
        #     update_qfunc_op.append(tf.assign(var_target, var))

        # def update_qfunc(sess, net_list_lock):
        #     # print('update_qfunc')
        #     # Must be inside tf.variable_scope(scope, reuse=reuse),
        #     # or use tf.get_default_session() (the session context manager cannot be used here).
        #     with sess.as_default():
        #         net_list_lock.acquire()
        #         for var, var_target in zip(net_list, q_func_vars):
        #             op = tf.assign(var_target, var)
        #             sess.run(op)
        #         # for i in range(0,len(update_qfunc_op)):
        #         #     sess.run(update_qfunc_op[i])
        #         # for next_var_target in q_func_vars:
        #         #     print(next_var_target.eval(session=sess))
        #         # print(net_list)
        #         net_list_lock.release()
        #     # print('end_update_qfunc')

        # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")
        # # print(q_func_vars)
        #
        # # The update_qfunc variant below (based on U.function) is still questionable; perhaps implement it separately with get_session().
        # update_qfunc_expr = []
        # net_list_lock.acquire()
        # for var, var_target in zip(in_list, q_func_vars):
        #     update_qfunc_expr.append(var_target.assign(var))
        # # print(update_target_expr)  # roughly a series of Assign ops
        #     update_qfunc_expr = tf.group(*update_qfunc_expr)  # tf.group() bundles the assign statements into a single op?
        # # print(update_target_expr)
        # update_qfunc = U.function([], [], updates=[update_qfunc_expr])

        # q_values = U.function([obs_t_input], q_t)

        return act_f, update_qfunc  # , {'q_values': q_values}
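
A hedged sketch of how an actor process might use the pair returned by actor_build. sess, env, model, net_list and net_list_lock are placeholders assumed to be created by the surrounding multiprocessing code, and ObservationInput is assumed to be importable from baselines.deepq.utils:

# Hypothetical actor-side usage; the shared weight list and its lock come from the trainer.
act, update_qfunc = actor_build(
    make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
    q_func=model,
    num_actions=env.action_space.n,
    net_list=net_list)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
update_qfunc(sess, net_list_lock)           # pull the trainer's latest Q-network weights
obs = env.reset()
action = act(obs[None], update_eps=0.1)[0]  # act comes from build_act / build_act_with_param_noise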