def load_act(path):
    # Unpickle the serialized model blob and the parameters used to build the act function.
    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    act = build_act(**act_params)
    sess = tf.Session()
    sess.__enter__()
    # Unpack the zipped TensorFlow checkpoint into a temporary directory and restore it.
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
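
# Usage sketch (illustrative, not part of the original module): load a previously saved
# policy with load_act() and run it greedily. The pickle path and the gym-style env are
# assumptions supplied by the caller; the returned ActWrapper is called with a batched
# observation, as in the standard deepq "enjoy" scripts.
def _example_run_saved_act(path, env):
    act = load_act(path)  # also enters a tf.Session and restores the saved variables
    obs, done = env.reset(), False
    episode_rew = 0.0
    while not done:
        obs, rew, done, _ = env.step(act(obs[None])[0])
        episode_rew += rew
    return episode_rew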
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                pretrain_optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    pretrain_optimizer: tf.train.Optimizer
        optimizer to use for the Bayes-DDPG pretraining objective
        (regressing the Q network onto precomputed Q-values).
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    train_target: (object, np.array) -> np.array
        regress the Q network onto precomputed Q-values (Bayes-DDPG pretraining step).
    copy_target_to_q: () -> ()
        copy the parameters from the target Q function back into the optimized Q function.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
                                           param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        precomputed_qvals = tf.placeholder(tf.float32, [None, num_actions], name="qval")  # GL

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        ################ Added steps for Bayes-DDPG ############################
        pretrain_target_error = tf.squared_difference(precomputed_qvals, q_t)
        pretrain_optimize_expr = pretrain_optimizer.minimize(tf.reduce_mean(pretrain_target_error),
                                                             var_list=q_func_vars)
        ########################################################################

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        ################ Added steps for Bayes-DDPG ############################
        # copy_target_to_q_fn may be called periodically to copy the target Q network back into the Q network
        copy_target_to_q_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            copy_target_to_q_expr.append(var.assign(var_target))
        copy_target_to_q_expr = tf.group(*copy_target_to_q_expr)
        ########################################################################

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr])

        ################ Added steps for Bayes-DDPG ############################
        # pretrain the Q network towards precomputed Q-values
        train_target = U.function(inputs=[obs_t_input, precomputed_qvals],
                                  outputs=pretrain_target_error,
                                  updates=[pretrain_optimize_expr])
        ########################################################################

        update_target = U.function([], [], updates=[update_target_expr])
        copy_target_to_q = U.function([], [], updates=[copy_target_to_q_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, train_target, copy_target_to_q, {'q_values': q_values}
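
# Usage sketch (illustrative, not part of the original module): how the callables returned by
# build_train() might be wired into a training loop. replay_buffer and its sample() layout,
# the precomputed_q array (shape [batch_size, num_actions] of target Q-values for the
# Bayes-DDPG pretraining step), and the flags below are assumptions supplied by the caller.
def _example_train_step(train, train_target, update_target, copy_target_to_q,
                        replay_buffer, precomputed_q, t, batch_size=32,
                        pretrain=False, reset_from_target=False, target_update_freq=500):
    import numpy as np
    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
    weights = np.ones_like(rewards)  # uniform weights when prioritized replay is off
    if pretrain:
        # Bayes-DDPG addition: regress the online Q network onto precomputed Q-values
        errors = train_target(obses_t, precomputed_q)
    else:
        # standard TD update on a sampled batch
        errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
    if reset_from_target:
        copy_target_to_q()  # Bayes-DDPG addition: load the target network back into the online network
    if t % target_update_freq == 0:
        update_target()     # periodically sync the target network from the online network
    return errors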
def actor_build(make_obs_ph,
                q_func,
                num_actions,
                net_list,
                scope="actor_deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    # Similar to build_train(), but only builds the actor's deep Q network and helpers.
    # Actor and trainer run in separate processes, so there is no scope conflict to worry about.
    # net_list is a shared list holding the trainer's Q network parameters (np.array, not tensors).
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
                                           param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    # Unlike build_train(), the actor only needs the forward pass.
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")

        # q network evaluation: q_func() returns the Q-values of every action as a [batch, num_actions] tensor
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act

        # q_func_vars collects every variable of the q network under this scope
        # (tf.get_collection() returns them as a list), so for inter-process communication
        # it is enough to transfer the values of q_func_vars.
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")
        # NOTE: defining q_func_vars inside update_qfunc() does not work -- why?

        placeholder_list = []
        for item in net_list:
            placeholder_list.append(tf.placeholder(tf.float32, shape=item.shape))

        # Do not create the assign ops inside the update loop below; that would keep adding
        # ops to the graph and leak memory.
        update_qfunc_op = list()
        for var, var_target in zip(placeholder_list, q_func_vars):
            # op = tf.assign(var_target, var)
            update_qfunc_op.append(tf.assign(var_target, var))  # assign op
        # update_qfunc_op = tf.group(*update_qfunc_op)  # group
        len_qfunc = len(update_qfunc_op)

        def update_qfunc(sess, net_list_lock):
            # Requires tf.variable_scope(scope, reuse=reuse), or use tf.get_default_session()
            # (the session context manager cannot be used here).
            with sess.as_default():
                net_list_lock.acquire()
                # for op in update_qfunc_op:
                #     sess.run(op)
                for i in range(0, len_qfunc):
                    sess.run(update_qfunc_op[i],
                             feed_dict={placeholder_list[i]: net_list[i]})
                    # sess.run(update_qfunc_op[i],
                    #          feed_dict={placeholder_list[i]: net_list[i + net_list_index.value * len_qfunc]})
                # for next_var_target in q_func_vars:
                #     print(next_var_target.eval(session=sess))
                # print(net_list)
                net_list_lock.release()  # release the lock

        # Below are several earlier implementation versions, each with some issues.

        # start right
        # update_qfunc_op = list()
        # for var, var_target in zip(net_list, q_func_vars):
        #     # op = tf.assign(var_target, var)
        #     update_qfunc_op.append(tf.assign(var_target, var))
        # update_qfunc_op = tf.group(*update_qfunc_op)  # group
        #
        # def update_qfunc(sess, net_list_lock):
        #     # print('update_qfunc')
        #     # Requires tf.variable_scope(scope, reuse=reuse), or use tf.get_default_session()
        #     # (the session context manager cannot be used here).
        #     with sess.as_default():
        #         net_list_lock.acquire()
        #         sess.run(update_qfunc_op)
        #         # for i in range(0, len(update_qfunc_op)):
        #         #     sess.run(update_qfunc_op[i])
        #         # for next_var_target in q_func_vars:
        #         #     print(next_var_target.eval(session=sess))
        #         # print(net_list)
        #         net_list_lock.release()
        #     # print('end_update_qfunc')
        # end right

        # update_qfunc_op = list()
        # for var, var_target in zip(net_list, q_func_vars):
        #     # op = tf.assign(var_target, var)
        #     update_qfunc_op.append(tf.assign(var_target, var))
        #
        # def update_qfunc(sess, net_list_lock):
        #     # print('update_qfunc')
        #     # Requires tf.variable_scope(scope, reuse=reuse), or use tf.get_default_session()
        #     # (the session context manager cannot be used here).
        #     with sess.as_default():
        #         net_list_lock.acquire()
        #         for var, var_target in zip(net_list, q_func_vars):
        #             op = tf.assign(var_target, var)
        #             sess.run(op)
        #         # for i in range(0, len(update_qfunc_op)):
        #         #     sess.run(update_qfunc_op[i])
        #         # for next_var_target in q_func_vars:
        #         #     print(next_var_target.eval(session=sess))
        #         # print(net_list)
        #         net_list_lock.release()
        #     # print('end_update_qfunc')

        # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
        #                                 scope=tf.get_variable_scope().name + "/q_func")
        # # print(q_func_vars)
        #
        # # The update_qfunc below (based on U.function) still needs discussion; perhaps implement
        # # it as a separate function using get_session().
        # update_qfunc_expr = []
        # net_list_lock.acquire()
        # for var, var_target in zip(in_list, q_func_vars):
        #     update_qfunc_expr.append(var_target.assign(var))
        # # print(update_target_expr)  # roughly a series of Assign ops
        # update_qfunc_expr = tf.group(*update_qfunc_expr)  # tf.group() turns the assigns into a single op
        # # print(update_target_expr)
        # update_qfunc = U.function([], [], updates=[update_qfunc_expr])

        # q_values = U.function([obs_t_input], q_t)

        return act_f, update_qfunc  # , {'q_values': q_values}
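
# Usage sketch (illustrative, not part of the original module): an actor process consuming
# the shared net_list. net_list is assumed to be a multiprocessing manager list whose entries
# are the numpy values of the trainer's q_func variables, stored in the same order as
# q_func_vars; net_list_lock is the matching multiprocessing.Lock; env is a gym-style
# environment; the sync schedule below is an assumption.
def _example_actor_loop(env, act, update_qfunc, sess, net_list_lock,
                        sync_every=100, num_steps=10000):
    obs = env.reset()
    for t in range(num_steps):
        if t % sync_every == 0:
            update_qfunc(sess, net_list_lock)  # pull the latest weights published by the trainer
        action = act(obs[None])[0]             # forward pass only; the actor never trains
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()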