def training_loss(op_rollout, s_history, a_history, v_history, r_history,
                  nenvs, nstep, training_depth=1):
    state_shape = s_history.shape.as_list()[1:]
    # for each envs, carry out same action for nstep
    r_vi, v_vi, s_vi = [], [], []
    # value iteration: expand for all possible action
    l = tf.expand_dims(tf.range(0, nenvs), 1)
    l = tf.concat([l, tf.tile([[0]], [nenvs, 1])], axis=1)
    s = tf.gather_nd(tf.reshape(s_history, [nenvs, nstep, -1]), l)
    a = tf.gather_nd(tf.reshape(a_history, [nenvs, nstep]), l)
    for i in range(training_depth):
        s_vi.append(s)
        r, v, s = op_rollout(s, a)
        r_vi.append(r)
        v_vi.append(v)
    r_vi = tf.stack(r_vi, axis=1)
    v_vi = tf.stack(v_vi, axis=1)
    s_vi = tf.stack(s_vi, axis=1)

    s_history = tf.reshape(s_history, [nenvs, 1, nstep, -1])
    v_history = tf.reshape(v_history, [nenvs, 1, nstep])
    r_history = tf.reshape(r_history, [nenvs, 1, nstep])
    s_vi = tf.reshape(s_vi, [nenvs, training_depth, 1, -1])
    v_vi = tf.reshape(v_vi, [nenvs, training_depth, 1])
    r_vi = tf.reshape(r_vi, [nenvs, training_depth, 1])

    # use the upper triangular part
    idx = np.flip(np.triu(np.ones([training_depth, nstep])), 1)
    idx = np.where(idx.reshape([-1]))[0]
    l = np.repeat(np.arange(nenvs), idx.size)
    l = np.stack([l, np.tile(idx, nenvs)], axis=1)
    s_mat = tf.gather_nd(
        tf.reshape(s_history - s_vi, [nenvs, training_depth * nstep, state_shape[-1]]), l)
    r_mat = tf.gather_nd(tf.reshape(r_history - r_vi, [nenvs, -1]), l)
    v_mat = tf.gather_nd(tf.reshape(v_history - v_vi, [nenvs, -1]), l)

    # # bn before loss
    # r_mat = r_bn(r_mat)
    # v_mat = v_bn(v_mat)

    # compute loss
    s_loss = tf.math.reduce_mean(huber_loss(s_mat))
    r_loss = tf.math.reduce_mean(huber_loss(r_mat))
    v_loss = tf.math.reduce_mean(huber_loss(v_mat))
    return r_loss + v_loss  # + s_loss
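# training_loss above calls a huber_loss helper that is not defined in this excerpt.
# A minimal sketch of the usual elementwise Huber loss is given below for reference;
# the project presumably imports an equivalent from its own utility module.
import tensorflow as tf

def huber_loss(x, delta=1.0):
    """Quadratic for |x| <= delta, linear beyond it, so large errors do not dominate the gradient."""
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),
        delta * (tf.abs(x) - 0.5 * delta),
    )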
def _build_train_reward_func(self, reward_func, observation_input_ph, action_input_ph, optimizer):
    with tf.variable_scope("reward_func_optimizer"):
        true_rewards_ph = tf.placeholder(tf.float32, [None], name="true_rewards")
        # loss = tf.metrics.mean_squared_error(reward_func, true_rewards_ph)
        true_rewards = tf.expand_dims(true_rewards_ph, axis=1)
        # loss = tf.reduce_mean(tf.losses.huber_loss(reward_func, true_rewards), name="loss")
        # Huber loss: maybe a bit more robust than a squared error.
        errors = reward_func - true_rewards
        loss = tf.reduce_mean(tf_utils.huber_loss(errors), name="loss")
        gradients = optimizer.compute_gradients(loss)
        for i, (grad, var) in enumerate(gradients):
            if grad is not None:  # variables unrelated to this loss have no gradient
                gradients[i] = (tf.clip_by_norm(grad, self.grad_norm_clipping), var)
        train_reward_func = optimizer.apply_gradients(gradients)
    return errors, train_reward_func, true_rewards_ph
def learn(env,
          q_func,
          policy_fn,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    # Create all the functions necessary to train the model
    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    scope = "ampi"
    reuse = None
    grad_norm_clipping = None
    num_actions = env.action_space.n
    optimizer_q = tf.train.AdamOptimizer(learning_rate=lr)
    optimizer_pi = tf.train.AdamOptimizer(learning_rate=lr)

    act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        ob_space = env.observation_space
        ac_space = env.action_space

        # policy network to be trained
        pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func")
        pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=tf.get_variable_scope().name + "/pi_func")
        # target policy network
        pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func")
        target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=tf.get_variable_scope().name + "/target_pi_func")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # Q_{train}(s, a)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # y_j = r + gamma * Q_{target}(s_{j+1}, argmax \pi(s_{j+1}))
        act_best = tf.argmax(pi, axis=1)  # argmax \pi
        q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # regression loss for the Q network
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # classification loss for the policy: labels z_j = argmax_a Q_{target}
        z_j = tf.argmax(q_tp1, axis=1)
        cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=z_j)

        # Q optimization
        if grad_norm_clipping is not None:
            gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients_q):
                if grad is not None:
                    gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_q = optimizer_q.apply_gradients(gradients_q)
        else:
            optimize_q = optimizer_q.minimize(weighted_error, var_list=q_func_vars)

        # pi optimization
        if grad_norm_clipping is not None:
            gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars)
            for i, (grad, var) in enumerate(gradients_pi):
                if grad is not None:
                    gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_pi = optimizer_pi.apply_gradients(gradients_pi)
        else:
            optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars)

        # update_target for Q
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # update_target for pi
        update_target_pi = []
        for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name),
                                   sorted(target_pi_func_vars, key=lambda v: v.name)):
            update_target_pi.append(var_target.assign(var))
        update_target_pi = tf.group(*update_target_pi)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, cl_error],
            updates=[optimize_q, optimize_pi]
        )
        update_target = U.function([], [], updates=[update_target_expr, update_target_pi])
        q_values = U.function([obs_t_input], q_t)

    debug = {'q_values': q_values}

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                       exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action = env.action_space.sample()  # not used, just so we have the datatype
            stochastic = True
            ac1, vpred1 = act(stochastic, np.array(obs)[None])
            action = ac1[0]
            # action, _ = pi.act(stochastic, obs)
            # action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False

            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Log training progress
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
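# A small numpy illustration (not part of the graph above) of the two targets the
# AMPI-style learn() builds: a regression target y_j for the Q network that
# bootstraps the target Q at the action preferred by the policy head, and a
# classification label z_j for the policy taken as the greedy action under the
# target Q. Values below are made up for illustration.
import numpy as np

gamma = 0.99
rew = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
pi_logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.3, 0.2]])       # policy logits
q_target_next = np.array([[1.0, 3.0, 0.5], [0.2, -0.1, 0.4]])   # target-network Q at the next state

act_best = pi_logits.argmax(axis=1)                 # action the policy prefers
q_sampled = q_target_next[np.arange(2), act_best]
y = rew + gamma * (1.0 - done) * q_sampled          # regression target for Q
z = q_target_next.argmax(axis=1)                    # classification label for pi
print(y, z)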
def __init__(self, inputs: TrainInputs, action_space, observation_space): act_size = action_space.n optimizer = tf.train.AdamOptimizer(learning_rate=inputs.lr) with tf.variable_scope( 'q_func' ): # child scopes of reusable parent scope are reusable self.runner = q_policy(obs=inputs.s0, epsilon=inputs.eps, action_space=action_space) with tf.variable_scope( 'q_func', reuse=True ): # child scopes of reusable parent scope are reusable q_net = q_policy(obs=inputs.s0, epsilon=inputs.eps, action_space=action_space) with tf.variable_scope('target_q_func'): target_q_net = q_policy(obs=inputs.s1, epsilon=inputs.eps, action_space=action_space) update_target_op = tf.group(*[ tf.assign(a, b) for a, b in zip(target_q_net.trainables, q_net.trainables) ]) if G.double_q: with tf.variable_scope( 'q_func', reuse=True ): # child scopes of reusable parent scope are reusable inner_q_net = q_policy(obs=inputs.s1, epsilon=inputs.eps, action_space=action_space) with tf.variable_scope('Q_training'): q_sampled = tf.reduce_sum(q_net.q_values * tf.one_hot(inputs.act, act_size), axis=1) if G.double_q: q_asterisk = tf.reduce_sum( target_q_net.q_values * tf.one_hot(inner_q_net.act_argmax, act_size), axis=1) else: q_asterisk = tf.reduce_max(target_q_net.q_values, axis=1) # compute RHS of bellman equation T_q = inputs.rew + (1.0 - inputs.done_mask_ph) * G.gamma * q_asterisk # compute the error (potentially clipped) td_error = q_sampled - tf.stop_gradient(T_q) _ = U.huber_loss(td_error) if G.prioritized_replay: loss = tf.reduce_mean(inputs.sample_weights * _) else: loss = tf.reduce_mean(_) # compute optimization op (potentially with gradient clipping) if G.grad_norm_clipping: optimize_op = U.minimize_and_clip( optimizer, loss, var_list=q_net.trainables, clip_val=G.grad_norm_clipping) else: optimize_op = optimizer.minimize(loss, var_list=q_net.trainables) def train(*, s0s, actions, rewards, s1s, dones, sample_weights=None): # read: SARSA feed_dict = { inputs.lr: G.learning_rate, inputs.s0: s0s, inputs.act: actions, inputs.rew: rewards, inputs.s1: s1s, inputs.done_mask_ph: dones } if G.prioritized_replay: assert sample_weights is not None, "sample_weights is required when prioritized_replay is ON." feed_dict[inputs.sample_weights] = sample_weights td_error_val, loss_val, _ = U.get_session().run( [td_error, loss, optimize_op], feed_dict) return td_error_val, loss_val def update_target(): U.get_session().run(update_target_op) self.train = train self.update_target = update_target
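# Minimal numpy sketch (illustration only) of the G.double_q branch above: the
# online network picks the greedy action, while its value is read from the target
# network; compare with the single-network max, which tends to overestimate.
import numpy as np

q_online_s1 = np.array([[1.0, 5.0, 2.0]])    # online Q(s', .)
q_target_s1 = np.array([[0.8, 4.0, 9.0]])    # target Q(s', .)

a_star = q_online_s1.argmax(axis=1)                  # action chosen by the online net
double_q_value = q_target_s1[np.arange(1), a_star]   # -> 4.0
single_q_value = q_target_s1.max(axis=1)             # -> 9.0
print(double_q_value, single_q_value)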
def build_train(make_obs_ph, make_bel_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ act_f = build_act(make_obs_ph, make_bel_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") bel_t_input = make_bel_ph("bel_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") expert_qval_ph = tf.placeholder(tf.float32, [None, num_actions], name="expert_qval_t") obs_tp1_input = make_obs_ph("obs_tp1") bel_tp1_input = make_bel_ph("bel_tp1") expert_qval_tp1_ph = tf.placeholder(tf.float32, [None, num_actions], name="expert_qval_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), bel_t_input.get(), expert_qval_ph, num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), bel_tp1_input.get(), expert_qval_tp1_ph, num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
one_hot_action = tf.one_hot(act_t_ph, num_actions) q_t_selected = tf.reduce_sum(q_t * one_hot_action, axis=1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), bel_tp1_input.get(), expert_qval_tp1_ph, num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), axis=1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) #q_t_selected = tf.Print(q_t_selected, [q_t_selected], '>>>> QT :', summarize=3) #q_t_selected_target = tf.Print(q_t_selected_target, [q_t_selected_target], '>>>> QT_Target :', summarize=3) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(tf.reduce_mean(td_error, axis=0)) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: # grad = tf.Print(grad, [grad], '>>>> grad: ', summarize=10) gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, bel_t_input, expert_qval_ph, act_t_ph, rew_t_ph, obs_tp1_input, bel_tp1_input, expert_qval_tp1_ph, done_mask_ph, importance_weights_ph, ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """ Creates the train function: :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of input with that name :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: - observation_in: (Any) the output of observation placeholder - num_actions: int number of actions - scope: (str) - reuse: (bool) should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. :param num_actions: (int) number of actions :param reuse: (bool) whether or not to reuse the graph variables :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. :param gamma: (float) discount rate. :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. :param scope: (str or VariableScope) optional scope for variable_scope. :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. :return: (tuple) act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given observation. See the top of the file for details. train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) optimize the error in Bellman's equation. See the top of the file for details. update_target: (function) copy the parameters from optimized Q function to the target Q function. See the top of the file for details. debug: ({str: function}) a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = tf_utils.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = tf_utils.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = tf_utils.function([], [], updates=[update_target_expr]) q_values = tf_utils.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
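# Hypothetical wiring of the four values returned by build_train above. The toy
# model() Q-network, ObservationInput, observation_space and num_actions names are
# assumptions for illustration only, not part of this module.
import tensorflow as tf

def model(obs, num_actions, scope, reuse=False):
    # toy fully-connected Q-network used only for this sketch
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(obs, 64, activation=tf.nn.relu)
        return tf.layers.dense(hidden, num_actions)

# make_obs_ph must return the TfInput-style wrapper this file expects (the code
# above calls .get() on it); ObservationInput is assumed to come from the same
# utility module as tf_utils.function.
act, train, update_target, debug = build_train(
    make_obs_ph=lambda name: ObservationInput(observation_space, name=name),
    q_func=model,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    gamma=0.99,
    double_q=True)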
def build_train(make_obs_ph, q_func, num_actions, optimizer, avg_reward_learning_rate=0.0001, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepr", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # Actions in output grid that are not valid are neginf unused_actions_mask = tf.placeholder(tf.float32, [None, num_actions], name="unused_actions_mask") rew_avg = tf.Variable(0., name='rew_avg') rew_avg_next = tf.Variable(0., name='rew_avg_next') # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) ##row_ids = tf.range(tf.shape(q_t)[0]) ##idx = tf.stack([row_ids, act_t_ph], axis=1) ##q_t_selected = tf.gather_nd(tf.reshape(q_t, [-1, num_actions]), idx) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) # Filter out unused actions #q_tp1_using_online_net = tf.boolean_mask(q_tp1_using_online_net, 1-unused_actions_mask, axis=1) #q_tp1 = tf.boolean_mask(q_tp1, 1-unused_actions_mask, axis=1) #q_t_filtered = tf.boolean_mask(q_t, 1-unused_actions_mask, axis=1) q_tp1_using_online_net = q_tp1_using_online_net + unused_actions_mask # Best q's -- useful for deciding whether to update R Learning # q_t_filtered = q_t + unused_actions_mask # q_t_best = tf.reduce_max(q_t_filtered, 1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) ##q_tp1_best_using_online_net = tf.argmax(tf.reshape(q_tp1_using_online_net, [-1, num_actions]), 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, tf.shape(q_tp1)[1]), 1) ##idx = tf.stack([tf.cast(row_ids, tf.int64), q_tp1_best_using_online_net], axis=1) ##q_tp1_best = tf.gather_nd(tf.reshape(q_tp1, [-1, num_actions]), idx) else: q_tp1_best = tf.argmax(q_tp1 + unused_actions_mask, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best, tf.shape(q_tp1)[1]), 1) #q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation ##q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked #with tf.control_dependencies([rew_avg.assign(rew_avg_next), tf.print(rew_avg)]): with tf.control_dependencies([rew_avg.assign(rew_avg_next)]): q_t_selected_target = rew_t_ph - rew_avg + q_tp1_best_masked # compute the error (potentially clipped) ##td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) # For R Learning td_error = tf.stop_gradient(q_t_selected_target) - q_t_selected errors = U.huber_loss(td_error) #errors = 
tf.losses.mean_squared_error(labels=tf.stop_gradient(q_t_selected_target), predictions=q_t_selected) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # R Learning tf.summary.scalar('rew_avg', rew_avg) #use_for_reward = tf.cast(tf.abs(q_t_selected - q_t_best) < 0.10*tf.abs(q_t_best), tf.float32) #use_for_reward = tf.cast(tf.abs(q_t_selected - q_t_best) < 1e-6, tf.float32) #num_valid_rewards = tf.reduce_sum(use_for_reward) #with tf.control_dependencies([tf.print(num_valid_rewards)]): #rew_avg_next_op = rew_avg_next.assign_add(tf.cond(num_valid_rewards > 0, # lambda: avg_reward_learning_rate*(1/num_valid_rewards)*tf.reduce_sum(use_for_reward * td_error), # lambda: 0.0)) rew_avg_next_op = rew_avg_next.assign_add(avg_reward_learning_rate*tf.reduce_mean(td_error)) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) with tf.control_dependencies([rew_avg_next_op]): optimize_expr = optimizer.apply_gradients(gradients) else: with tf.control_dependencies([rew_avg_next_op]): optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, unused_actions_mask ], #outputs=td_error, outputs=[tf.summary.histogram("rewards", rew_t_ph, collections=[]), weighted_error, tf.reduce_mean(q_t_selected), tf.reduce_mean(q_t_selected_target)], updates=[optimize_expr] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) #max_q_values = U.function([obs_t_input, unused_actions_mask], tf.reduce_max(tf.boolean_mask(q_t, 1-unused_actions_mask, axis=1), 1)) max_q_values = U.function([obs_t_input, unused_actions_mask], tf.reduce_max(q_t + unused_actions_mask, 1)) return act_f, train, update_target, {'q_values': q_values, 'max_q_values': max_q_values}
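# Numpy sketch of the R-learning target used above (illustrative numbers): instead of
# the discounted target r + gamma * max_a' Q(s', a'), the code uses
# r - rew_avg + max_a' Q(s', a'), and nudges the running average reward rew_avg
# toward consistency with the TD errors it observes.
import numpy as np

rew_avg = 0.3                      # current average-reward estimate
lr_rho = 1e-4                      # avg_reward_learning_rate
r, q_next_best, q_selected = 1.0, 2.5, 2.0

target = r - rew_avg + q_next_best             # q_t_selected_target
td_error = target - q_selected                 # sign convention matches the code above
rew_avg_next = rew_avg + lr_rho * td_error     # rew_avg_next.assign_add(...)
print(target, td_error, rew_avg_next)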
def build_train(make_obs_ph, q_func, num_actions, optimizer, train_gaze, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="DeepqWithGaze", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") initial_freeze_phase_ph = tf.placeholder(tf.bool, (), name="initial_freeze_phase") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = gflag.qfunc_models.get( "q_func").weights # already includes gaze_models weights q_func_trainable_vars = [ w for w in gflag.qfunc_models.get("q_func").trainable_weights \ if (train_gaze or w not in gflag.gaze_models.get("q_func").trainable_weights) ] # train_gaze=False excludes gaze model's weight # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = gflag.qfunc_models.get( "target_q_func").weights # already includes gaze_models weights # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) initial_freeze_weights = gflag.qfunc_models.get_weight_names_for_initial_freeze( model_name="q_func") q_func_trainable_vars_for_initial_freeze = list( filter(lambda w: w.name not in initial_freeze_weights, q_func_trainable_vars)) if grad_norm_clipping is not None: optimize_expr_for_initial_freeze = lambda: U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_trainable_vars_for_initial_freeze, clip_val=grad_norm_clipping) \ if q_func_trainable_vars_for_initial_freeze else tf.no_op() optimize_expr_after_freeze = lambda: U.minimize_and_clip( optimizer, weighted_error, var_list=q_func_trainable_vars, clip_val=grad_norm_clipping) else: # must put the operation under lambda, if you fully read tf.cond()'s documentation optimize_expr_for_initial_freeze = lambda: optimizer.minimize( weighted_error, var_list=q_func_trainable_vars_for_initial_freeze) optimize_expr_after_freeze = lambda: optimizer.minimize( weighted_error, var_list=q_func_trainable_vars) optimize_expr = tf.cond(initial_freeze_phase_ph, optimize_expr_for_initial_freeze, optimize_expr_after_freeze) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] assert 
len(q_func_vars) == len(target_q_func_vars) for var, var_target in zip(q_func_vars, target_q_func_vars): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, initial_freeze_phase_ph, ], outputs=td_error, updates=[optimize_expr], givens={K.backend.learning_phase(): 1}) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) # For tensorboard merged = tf.summary.merge([ tf.summary.image('img_curframe', obs_t_input.get()), tf.summary.image( 'gaze_curframe', q_func(obs_t_input.get(), num_actions, scope="q_func", return_gaze=True, reuse=True)) ]) tensorboard_summary = U.function( inputs=[obs_t_input], outputs=merged, givens={K.backend.learning_phase(): 0}) return act_f, train, update_target, { 'q_values': q_values }, tensorboard_summary
def build_train_neural_linear(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, actor='target'): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if actor == 'target': act_f = build_act_thompson(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, actor="target_q_func") else: act_f = build_act_thompson(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, actor="q_func") with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation if actor == 'target': print("actor is target") q_t, phi_xt = q_func(obs_t_input.get(), num_actions, scope="q_func") q_tp1, phi_target_xtp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=True) else: print("actor is dqn") q_t, phi_xt = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) q_tp1, phi_target_xtp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") last_layer_weights = q_func_vars[-2]#target_q_func_vars[-2] # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 # double dqn learning if double_q: print("building ddqn loss for neural linear") q_tp1_using_online_net, _ = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: print("building dqn loss for neural linear") q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # tf.matmul(tf.expand_dims(phi_xt,-1),tf.expand_dims(phi_xt,1)) # blr additions phiphiTop = tf.matmul(tf.transpose(phi_xt), phi_xt) phiYop = tf.squeeze(tf.matmul(tf.expand_dims(q_t_selected_target,0), phi_xt)) feat_dim = phi_xt.shape[1].value #carefull batch size here is actually action size precision_mat = tf.placeholder(tf.float64, [None] + [feat_dim, feat_dim], name="phiphiT") phiY = tf.placeholder(tf.float64, [None] + [feat_dim, 1], name="phiphiT") covariance_mat = tf.matrix_inverse(precision_mat) w_mu = tf.squeeze(tf.matmul(covariance_mat,phiY),axis=-1) w_ph = tf.placeholder(tf.float32, [None] + [num_actions, feat_dim], name="w") # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q 
network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) target_q_values = U.function([obs_tp1_input], q_tp1) feat = U.function([obs_t_input], phi_xt) feat_target = U.function([obs_tp1_input], phi_target_xtp1) # Create callable functions blr_ops = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[phiphiTop,phiYop] ) blr_helpers = U.function([precision_mat,phiY],[covariance_mat, w_mu]) return act_f, train, update_target, feat_dim, feat, feat_target, target_q_values, last_layer_weights, blr_ops, blr_helpers
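# Numpy sketch of the Bayesian-linear-regression quantities the code above exposes
# through blr_ops and blr_helpers: per batch it accumulates phi^T phi and phi^T y,
# and the posterior mean of the last-layer weights is precision^{-1} (phi^T y).
# The feature dimension and the identity prior used here are assumptions for
# illustration.
import numpy as np

feat_dim = 512
phi = np.random.randn(32, feat_dim)      # phi(x_t) for a batch of 32 transitions
y = np.random.randn(32)                  # q_t_selected_target for the same batch

phiphiT = phi.T @ phi                    # [feat_dim, feat_dim], as in blr_ops
phiY = phi.T @ y                         # [feat_dim], as in blr_ops

precision = phiphiT + np.eye(feat_dim)   # plus a prior term, assumed identity here
covariance = np.linalg.inv(precision)    # matches tf.matrix_inverse(precision_mat)
w_mu = covariance @ phiY                 # posterior mean, matches w_mu above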
def build_train(make_obs_ph, mu_func, v_func, l_func, action_noise, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, mu_func, action_noise, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") with tf.variable_scope("q_func"): l_t = l_func(obs_t_input.get(), int((num_actions * (num_actions + 1)) / 2), scope="l_func") mu_t = mu_func(obs_t_input.get(), num_actions, scope="mu_func", reuse=True) v_t = v_func(obs_t_input.get(), 1, scope="v_func") diagonal = tf.exp(l_t[:, :num_actions]) rows, diag = [], [] pivot = num_actions for i in range(num_actions): rows.append( tf.pad(l_t[:, pivot:i + pivot], [[0, 0], [0, num_actions - i]])) diag.append( tf.pad(tf.expand_dims(diagonal[:, i], 1), [[0, 0], [i, num_actions - 1 - i]])) pivot += i l_t = tf.transpose(tf.stack(rows), (1, 0, 2)) + tf.transpose( tf.stack(diag), (1, 0, 2)) #print("shape L", tf.stack(rows).shape, diagonal.shape) L = tf.matmul(l_t, tf.transpose(l_t, (0, 2, 1))) u = tf.expand_dims(act_t_ph - mu_t, 1) print("L shape", L.shape, u.shape) a_t = -0.5 * tf.reduce_mean( tf.matmul(tf.matmul(u, L), tf.transpose(u, (0, 2, 1))), 2) q_t = a_t + v_t # q network evaluation #q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") v_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func/v_func") # target q network evalution v_tp1 = v_func(obs_tp1_input.get(), 1, scope="target_v_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_v_func") # q scores for actions which we know were selected in the given state. 
#q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) q_t_selected = q_t print( "q_shape", tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func/v_func")) # compute estimate of best possible value starting from state at t + 1 #if double_q: # q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) # q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) # q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) #else: # q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * v_tp1 # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(v_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): print("var_target", var_target, var_target.shape, var, var.shape) update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
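# Numpy sketch of the NAF-style quadratic advantage assembled above: the network
# output is unpacked into a lower-triangular matrix L (diagonal passed through exp),
# P = L L^T is positive definite, and the advantage -0.5 (a - mu)^T P (a - mu) is
# maximal (zero) exactly at a = mu. Numbers are made up for illustration.
import numpy as np

num_actions = 2
l_out = np.array([0.1, -0.3, 0.5])       # 2 diagonal entries + 1 strictly-lower entry

L = np.zeros((num_actions, num_actions))
L[np.diag_indices(num_actions)] = np.exp(l_out[:num_actions])
L[1, 0] = l_out[num_actions]

P = L @ L.T
mu = np.array([0.2, -0.1])               # mu_func output
a = np.array([0.5, 0.3])                 # action actually taken
advantage = -0.5 * (a - mu) @ P @ (a - mu)
q_value = advantage + 1.7                # plus the state value v_func(s)
print(advantage, q_value)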
def build_train_ib(make_obs_ph, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, beta=1.0, theta=1, double_q=True, emdqn=True, vae=True, ib=True, scope="deepq_ib", reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. beta: float coefficient of beta-ib. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" act_noise = tf.placeholder(tf.float32, [None, 512], name="act_noise") act_f = build_act_ib(make_obs_ph, model_func, act_noise, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") z_noise_t = tf.placeholder(tf.float32, [None, 512], name="z_noise") z_noise_tp1 = tf.placeholder(tf.float32, [None, 512], name="z_noise_tp1") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") inputs = [ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, act_noise, z_noise_t, z_noise_tp1 ] # EMDQN if emdqn or ib: qec_input = tf.placeholder(tf.float32, [None], name='qec') inputs.append(qec_input) if ib or vae: obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae")) z_noise_vae = tf.placeholder(tf.float32, [None, 512], name="z_noise_vae") inputs.append(obs_vae_input) inputs.append(z_noise_vae) # q network evaluation q_t, v_mean_t, v_logvar_t, z_mean_t, z_logvar_t, recon_obs_t = model_func( obs_t_input.get(), z_noise_t, num_actions, scope="q_func", reuse=True) if vae or ib: q_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = model_func( obs_vae_input.get(), z_noise_vae, num_actions, scope="q_func", reuse=True) # q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1, q_d_tp1, v_mean_tp1, v_logvar_tp1, z_mean_tp1, z_logvar_tp1, recon_obs_tp1 = model_func( obs_tp1_input.get(), z_noise_tp1, num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net, _, _, _, _, _, _ = model_func( obs_tp1_input.get(), z_noise_tp1, num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) td_loss = tf.reduce_mean(importance_weights_ph * U.huber_loss(td_error)) outputs = [td_loss] total_loss = td_loss if vae or ib: encoder_loss = -1 + z_mean_vae**2 + tf.exp( z_logvar_vae) - z_logvar_vae outputs.append(encoder_loss) total_loss += 0.1 * tf.reduce_mean(beta * encoder_loss) if vae: decoder_loss = tf.keras.losses.binary_crossentropy( tf.reshape(recon_obs, [-1]), tf.reshape( tf.dtypes.cast(obs_vae_input._placeholder, tf.float32), [-1])) print("here", z_mean_t.shape, z_logvar_t.shape, encoder_loss.shape, decoder_loss.shape) vae_loss = beta * encoder_loss + theta * decoder_loss outputs.append(decoder_loss) outputs.append(vae_loss) total_loss += 0.1 * tf.reduce_mean(theta * decoder_loss) if ib: ib_loss = (v_mean_t - tf.stop_gradient(tf.expand_dims( qec_input, 1)))**2 / tf.exp(v_logvar_t) + v_logvar_t print("here2", v_mean_t.shape, tf.expand_dims(qec_input, 1).shape, v_logvar_t.shape, ib_loss.shape) total_ib_loss = ib_loss + beta * encoder_loss outputs.append(total_ib_loss) total_loss += 0.1 * tf.reduce_mean(ib_loss) # EMDQN if emdqn: qec_error = q_t_selected - tf.stop_gradient(qec_input) total_loss += 0.1 * tf.reduce_mean( importance_weights_ph * U.huber_loss(qec_error)) outputs.append(qec_error) td_loss_summary = tf.summary.scalar("td loss", td_loss) total_loss_summary = tf.summary.scalar("total loss", total_loss) z_var_summary = tf.summary.scalar("z_var", tf.reduce_mean(tf.exp(z_logvar_t))) summaries = [td_loss_summary, total_loss_summary, z_var_summary] if vae or ib: encoder_loss_summary = tf.summary.scalar( "encoder loss", tf.reduce_mean(encoder_loss)) summaries.append(encoder_loss_summary) if vae: decoder_loss_summary = tf.summary.scalar( "decoder loss", tf.reduce_mean(decoder_loss)) summaries.append(decoder_loss_summary) if ib: ib_loss_summary = tf.summary.scalar("ib loss", tf.reduce_mean(ib_loss)) total_ib_loss_summary = tf.summary.scalar( "total ib loss", tf.reduce_mean(total_ib_loss)) summaries.append(ib_loss_summary) summaries.append(total_ib_loss_summary) if emdqn: qec_loss_summary = tf.summary.scalar( "qec loss", tf.reduce_mean(importance_weights_ph * qec_error)) summaries.append(qec_loss_summary) summary = tf.summary.merge(summaries) outputs.append(summary) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_loss, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = 
tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=inputs, outputs=[td_error, summary], updates=[optimize_expr]) get_q_t_selected = U.function( inputs=[obs_t_input, act_t_ph, z_noise_t], outputs=q_t_selected) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input, z_noise_t], q_t) return act_f, train, update_target, { 'q_values': q_values }, get_q_t_selected
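# Illustrative NumPy sketch (not part of the graph built above; names are mine) of the two
# auxiliary terms composed into total_loss: the encoder KL-style term of a diagonal Gaussian
# against N(0, I), and the heteroscedastic value-regression ("ib") term that scales the squared
# error by the predicted variance.
import numpy as np

def encoder_kl_term(z_mean, z_logvar):
    # per-dimension contribution: -1 + mu^2 + exp(logvar) - logvar
    return -1.0 + z_mean ** 2 + np.exp(z_logvar) - z_logvar

def ib_value_term(v_mean, v_logvar, qec):
    # Gaussian negative log-likelihood of qec under N(v_mean, exp(v_logvar)), up to constants:
    # squared error scaled by the predicted variance, plus the log-variance penalty
    return (v_mean - qec) ** 2 / np.exp(v_logvar) + v_logvar

_rng = np.random.default_rng(0)
_zm, _zv = _rng.normal(size=(4, 512)), _rng.normal(size=(4, 512))
_vm, _vv, _q = _rng.normal(size=(4, 1)), _rng.normal(size=(4, 1)), _rng.normal(size=(4, 1))
_sketch_total = 0.1 * encoder_kl_term(_zm, _zv).mean() + 0.1 * ib_value_term(_vm, _vv, _q).mean()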
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select an action given an observation. See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. See the top of the file for details. update_target: () -> () copy the parameters from the optimized Q function to the target Q function. See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): multi_step_num = 3 # multi step return 10, 5 gamma = 0.7 # discount rate # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") # if long-horizon (multi-step) returns are used, this needs to hold the discounted n-step sum obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # build the Q network and the target Q network; q_func() returns the Q values of all actions, q_t is a list of per-action values # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state.
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation, i.e. the TD target # q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked q_t_selected_target = rew_t_ph + (gamma**multi_step_num) * q_tp1_best_masked # multi step return # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # # start cpu # with tf.device('/cpu:0'): # # compute optimization op (potentially with gradient clipping) # if grad_norm_clipping is not None: # gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) # for i, (grad, var) in enumerate(gradients): # if grad is not None: # gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) # optimize_expr = optimizer.apply_gradients(gradients) # else: # optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # # end cpu # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network # sorted() does not modify the original iterable update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), # sort by the variables' names sorted(target_q_func_vars, key=lambda v: v.name)): # print(var) # var / var_target here is just a tensor # print(var_target) update_target_expr.append(var_target.assign(var)) # print(update_target_expr) # roughly a list of Assign ops update_target_expr = tf.group(*update_target_expr) # tf.group() combines the assignments into a single op
# Because there is a single writer process and multiple reader processes, the two kinds of operations are applied to different memory regions to reduce lock contention; some aspects are still not ideal # initialize the actor's Q network def init_actor_qfunc(sess, net_list): # needs tf.variable_scope(scope, reuse=reuse), which is why it is defined here # alternatively use tf.get_default_session() (the context manager cannot be used) with sess.as_default(): # net_list_lock.acquire() # empty the list i = len(net_list) while i > 0: i -= 1 del net_list[i] for var_actor in q_func_vars: # whether the overall ordering is correct still needs to be verified net_list.append(var_actor.eval(session=sess)) # stored as a list # for var_actor in q_func_vars: # net_list would be twice the length of q_func_vars # net_list.append(var_actor.eval(session=sess)) gc.collect() # free memory; probably unnecessary on Python 3.5+ # net_list_lock.release() # release the lock len_q_func = len(q_func_vars) # update the actor's Q network def update_actor_qfunc(sess, net_list, net_list_lock): with sess.as_default(): net_list_lock.acquire() for i_tensor in range(len_q_func): net_list[i_tensor] = q_func_vars[i_tensor].eval(session=sess) net_list_lock.release() # release the lock # the three functions below wrap train, update_target and q_values respectively # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr] ) # the update_target op has no inputs or outputs update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, init_actor_qfunc, update_actor_qfunc, {'q_values': q_values}
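# Minimal sketch of the n-step reward the modified TD target above expects in rew_t_ph when
# bootstrapping with gamma ** multi_step_num: the replay buffer (not shown here) would supply
# sum_{k<n} gamma^k * r_{t+k}. The function name and the truncation-at-terminal behaviour are
# assumptions, not the author's code.
import numpy as np

def n_step_reward(rewards, dones, gamma=0.7, n=3):
    """Discounted sum of up to n rewards, truncated at episode end."""
    total, discount = 0.0, 1.0
    for r, d in zip(rewards[:n], dones[:n]):
        total += discount * r
        if d:                      # stop accumulating past a terminal step
            break
        discount *= gamma
    return total

assert abs(n_step_reward([1.0, 1.0, 1.0], [0, 0, 0]) - (1 + 0.7 + 0.49)) < 1e-9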
def build_train(make_obs_ph, q_func, num_actions, optimizer, chief=False, server=None, workers=1, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that take a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. chief: bool whether or not the worker should assume chief duties. these include: initializing global parameters, tensorboarding, saving, etc. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ task = server.server_def.task_index act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, task=task) with tf.variable_scope(scope, reuse=reuse): with tf.device("/job:worker/task:{}".format(task)): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # Local timestep counters t = tf.placeholder(tf.float32, [1], name="t") t_global_old = tf.placeholder(tf.float32, [1], name="t_global_old") score_input = tf.placeholder(tf.float32, [1], name="score_input") grad_prio = tf.placeholder(tf.bool, [1], name="grad_prio") converged_ph = tf.placeholder(tf.bool, [1], name="converged") factor_input = tf.placeholder(tf.float32, [1], name="factor_input") # Global timestep counter # TODO Does TF have built-in global step counters? 
with tf.device("/job:ps/task:0"): t_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="t_global") run_code_global = tf.Variable(initial_value="", name="run_code_global") comm_rounds_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="comm_rounds_global") max_workers_global = tf.constant(workers, dtype=tf.float32, name="max_workers_global") worker_count_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="worker_count_global") score_max_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="score_max_global") score_min_global = tf.Variable(dtype=tf.float32, initial_value=[0], name="score_min_global") submit_count_global = tf.Variable(dtype=tf.float32, initial_value=[-1], name="submit_count_global") converged_global = tf.Variable(dtype=tf.bool, initial_value=[False], name="converged_global") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # global weights print("chief:", chief, "reuse:", True if not chief else None) global_q_func_vars = [] # with tf.device(tf.train.replica_device_setter(cluster=cluster)): with tf.device( "/job:ps/task:0"): # TODO needs RDS if using multiple PS # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights", reuse=None if chief else True)#reuse=(not chief)) # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights") with tf.variable_scope("global_weights"): for var in q_func_vars: name = var.name.split(":")[0].split("q_func/")[-1] global_q_func_vars.append( tf.get_variable(name=name, shape=var.shape, dtype=var.dtype, initializer=tf.contrib.layers. xavier_initializer( seed=1, dtype=var.dtype))) # global_q_func_vars = U.scope_vars(U.absolute_scope_name("global_weights")) # print("Global:", global_q_func_vars) # old weights (used to implicitly calculate gradient sum: q_func_vars - q_func_vars_old) q_func_vars_old = [] with tf.variable_scope("old_weights"): for var in q_func_vars: name = var.name.split(":")[0].split("q_func/")[-1] q_func_vars_old.append( tf.get_variable( name=name, shape=var.shape, dtype=var.dtype, initializer=tf.contrib.layers.xavier_initializer( seed=1, dtype=var.dtype))) # q_old = q_func(obs_t_input.get(), num_actions, scope="old_weights") # q_func_vars_old = U.scope_vars(U.absolute_scope_name("old_weights")) # print("Old vars:", q_func_vars_old) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum( q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max( q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip( optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # update_global_fn will be called periodically to copy global Q network to q network update_global_expr = [] for var_global, var, var_old in zip( sorted(global_q_func_vars, key=lambda v: v.name), sorted(q_func_vars, key=lambda v: v.name), sorted(q_func_vars_old, key=lambda v: v.name)): update_global_expr.append(var.assign(var_global)) # TODO Can async cause var <- var_global, var_global <- new value, var_old <- var_global in that order? # TODO Should this copy from var instead? (concurrency issues?) # TODO Can concurrency cause var_old <- var, var <- var_global in that order (resulting in wrong values)? # TODO Safest method is to force sequential execution of var <- var_global, var_old <- var! How though? 
update_global_expr.append(var_old.assign(var_global)) update_global_expr = tf.group(*update_global_expr) # update the global time step counter by adding the local update_t_global = t_global.assign_add(t) optimize_global_expr = [] # Factor to multiply every gradient with # f = t / (t_global - t_global_old) dt = tf.subtract(update_t_global, t_global_old) factor = tf.where( tf.greater_equal(factor_input, 0), factor_input, tf.where( grad_prio, tf.divide(tf.subtract(score_input, score_min_global), tf.subtract(score_max_global, score_min_global)), tf.div(t, dt))) for var, var_old, var_global in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(q_func_vars_old, key=lambda v: v.name), sorted(global_q_func_vars, key=lambda v: v.name)): # Multiply the difference between the old parameters and the locally optimized parameters # g = (var - var_old) * f grad = tf.multiply(tf.subtract(var, var_old), factor) optimize_global_expr.append(var_global.assign_add(grad)) optimize_global_expr = tf.group(*optimize_global_expr) # if cr == cr_g and wc < wc_max: # wc += 1 # score_global += score # if cr == cr_g and wc == wc_max: # vc += 1 # score_global += score # cr_g += 0.5 # return cr_g """ if cr == cr_g: if wc <= wc_max: wc += 1 score_global += score if wc == wc_max: cr_g += 0.5 return cr_g """ # submit_score_expr = \ # tf.cond(tf.equal(comm_rounds, comm_rounds_global), # lambda: tf.cond(tf.less_equal(worker_count_global, max_workers_global), # lambda: tf.group(worker_count_global.assign_add([1]), # score_global.assign_add(score_input), # tf.cond(tf.equal(worker_count_global, max_workers_global), # lambda: comm_rounds_global.assign_add([0.5]), # lambda: None)), # lambda: tf.group(None, None, None)), # lambda: None) # submit_score_expr = \ # tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global), # tf.less(worker_count_global, max_workers_global)), # tf.group(worker_count_global.assign_add(1), # score_global.assign_add(score_input)), # tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global), # tf.equal(worker_count_global, max_workers_global)), # tf.group(worker_count_global.assign_add(1), # score_global.assign_add(score_input), # comm_rounds_global.assign_add(0.5)))) # This makes a sum of all scores ( # submit_score_expr = score_global.assign_add(score_input) # This only saves the maximum score (for normalized score weighting) submit_score_max = score_max_global.assign(tf.maximum( score_input, score_max_global), use_locking=True) submit_score_min = score_min_global.assign(tf.minimum( score_input, score_min_global), use_locking=True) set_submit_count = submit_count_global.assign(score_input, use_locking=True) inc_submit_count = submit_count_global.assign_add([1], use_locking=True) # check_round_op = tf.equal(comm_rounds, comm_rounds_global) # Not used anymore inc_wc = worker_count_global.assign_add([1], use_locking=True) zero_wc = worker_count_global.assign([0], use_locking=True) inc_cr = comm_rounds_global.assign_add([1], use_locking=True) score_reset = score_max_global.assign([0], use_locking=True) converged_set = converged_global.assign(converged_ph, use_locking=True) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[td_error], updates=[optimize_expr]) global_opt = U.function( inputs=[t, t_global_old, score_input, factor_input, grad_prio], outputs=[dt, comm_rounds_global, factor], updates=[optimize_global_expr]) # global_sync_opt = U.function(inputs=[comm_rounds], 
# outputs=[comm_rounds_global], # updates=[optimize_global_sync_expr]) update_weights = U.function(inputs=[], outputs=[t_global], updates=[update_global_expr]) update_target = U.function([], [], updates=[update_target_expr]) submit_score = U.function( inputs=[score_input], outputs=[comm_rounds_global], updates=[submit_score_max, submit_score_min]) check_round = U.function(inputs=[], outputs=[comm_rounds_global], updates=[]) request_submit = U.function(inputs=[], outputs=[comm_rounds_global, inc_wc], updates=[]) set_submit = U.function(inputs=[score_input], outputs=[set_submit_count], updates=[]) check_submit = U.function(inputs=[], outputs=[submit_count_global], updates=[]) inc_submit = U.function(inputs=[], outputs=[inc_submit_count], updates=[]) inc_comm_round = U.function(inputs=[], outputs=[inc_cr], updates=[]) reset_wc = U.function(inputs=[], outputs=[zero_wc], updates=[]) check_wc = U.function(inputs=[], outputs=[worker_count_global], updates=[]) reset_score = U.function(inputs=[], outputs=[], updates=[score_reset]) set_converged = U.function(inputs=[converged_ph], outputs=[], updates=[converged_set]) check_converged = U.function(inputs=[], outputs=[converged_global], updates=[]) # Debugging functions q_values = U.function([obs_t_input], q_t) weights = U.function( inputs=[], outputs=[q_func_vars, global_q_func_vars, q_func_vars_old], updates=[]) t_global_func = U.function([], t_global) comm_rounds_func = U.function([], comm_rounds_global) return act_f, train, global_opt, update_target, update_weights, \ {'request_submit': request_submit, 'submit_score': submit_score, 'check_round': check_round, 'check_submit': check_submit, 'set_submit': set_submit, 'inc_submit': inc_submit, 'inc_comm_round': inc_comm_round, 'reset_wc': reset_wc, 'check_wc': check_wc, 'reset_score': reset_score, 'set_converged': set_converged, 'check_converged': check_converged}, \ {'q_values': q_values, 'weights': weights, 't_global': t_global_func, 'run_code': run_code_global, 'comm_rounds': comm_rounds_func, 'factor': factor}
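# NumPy sketch of what optimize_global_expr above does on the parameter server: each worker adds
# its weight delta (var - var_old) scaled by a factor, which is either an explicit factor, a
# min-max normalised score, or its share of global timesteps t / dt. Purely illustrative; the
# function name and the list-of-arrays representation are my own.
import numpy as np

def push_to_global(global_w, local_w, old_w, t, dt,
                   score=None, score_min=None, score_max=None, factor=-1.0):
    if factor >= 0:
        f = factor                                            # fixed factor
    elif score is not None:
        f = (score - score_min) / (score_max - score_min)     # normalised-score weighting
    else:
        f = t / dt                                            # timestep share
    return [gw + (w - ow) * f for gw, w, ow in zip(global_w, local_w, old_w)]

# a worker that produced 10 of the 40 new global steps contributes 25% of its delta
print(push_to_global([np.zeros(3)], [np.ones(3)], [np.zeros(3)], t=10.0, dt=40.0))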
def build_train(make_obs_ph, q_func, num_actions, optimizer, bootstrap=False, swarm=False, voting=False, heads=1, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, device="/cpu:0"): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ act_f = build_act(make_obs_ph, q_func, bootstrap=bootstrap, swarm=swarm, voting=voting, heads=heads, num_actions=num_actions, scope=scope, reuse=reuse, device=device) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") update_lr_ph = tf.placeholder(tf.float32, (), name="learning_rate") lr = tf.get_variable("lr", (), initializer=tf.constant_initializer(0)) with tf.device(device): # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True, heads=heads) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=True, heads=heads) # reuse parameters form act target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = [] for i in range(heads): q_t_selected.append( tf.reduce_sum(q_t[i] * tf.one_hot(act_t_ph, num_actions), 1)) # compute estimate of best possible value starting from state at t + 1 q_tp1_best = [] q_tp1_best_using_online_net = [] if swarm: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True, heads=heads) action_subsets = [] for i in range(heads): target_greedy_action = tf.argmax(q_tp1[i], axis=1) online_q_value_threshold = tf.reduce_sum( q_tp1_using_online_net[i] * tf.one_hot(target_greedy_action, num_actions), 1) online_q_value_threshold = tf.tile( tf.expand_dims(online_q_value_threshold, 1), tf.constant([1, num_actions])) action_subset = tf.where( (q_tp1_using_online_net[i] - online_q_value_threshold) >= 0, tf.ones([tf.shape(obs_t_input.get())[0], num_actions]), tf.zeros([tf.shape(obs_t_input.get())[0], num_actions])) action_subsets.append(action_subset) action_subsets = tf.stack(action_subsets, axis=1) actions_cover = set_cover(action_subsets) # preferred_actions = tf.transpose(action_subsets, [1, 0, 2]) for i in range(heads): q_tp1_best_using_online_net.append( tf.argmax(tf.multiply(actions_cover, q_tp1[i]), axis=1)) q_tp1_best.append( tf.reduce_sum( q_tp1[i] * tf.one_hot( q_tp1_best_using_online_net[i], num_actions), 1)) elif double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True, heads=heads) for i in range(heads): q_tp1_best_using_online_net.append( tf.arg_max(q_tp1_using_online_net[i], 1)) q_tp1_best.append( tf.reduce_sum( q_tp1[i] * tf.one_hot( q_tp1_best_using_online_net[i], num_actions), 1)) else: for i in range(heads): q_tp1_best.append(tf.reduce_max(q_tp1, 1)) q_tp1_best_masked = [] q_t_selected_target = [] td_error = [] errors = [] weighted_error = [] optimize_expr = [] optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.99, epsilon=1e-4) q_func_heads = U.scope_vars(U.absolute_scope_name("q_func/heads")) q_func_convnets = U.scope_vars(U.absolute_scope_name("q_func/convnet")) for i in range(heads): q_tp1_best_masked.append((1.0 - done_mask_ph) * q_tp1_best[i]) # compute RHS of bellman equation q_t_selected_target.append(rew_t_ph + gamma * q_tp1_best_masked[i]) # compute the error (potentially clipped) td_error.append(q_t_selected[i] - tf.stop_gradient(q_t_selected_target[i])) with tf.device(device): errors.append(U.huber_loss(td_error[i])) weighted_error.append( tf.reduce_mean(importance_weights_ph * errors[i])) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr.append( U.minimize_and_clip(optimizer, weighted_error[i], var_list=q_func_heads, clip_val=grad_norm_clipping)) optimize_expr.append( U.minimize_and_clip(optimizer, 0.1 * weighted_error[i], var_list=q_func_convnets, clip_val=grad_norm_clipping)) else: optimize_expr.append( optimizer.minimize(weighted_error[i], var_list=q_func_vars)) update_lr_expr = lr.assign( tf.cond(update_lr_ph >= 0, lambda: update_lr_ph, lambda: lr)) optimize_expr.append(update_lr_expr) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, update_lr_ph ], 
outputs=td_error[0], updates=optimize_expr, ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
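# Small NumPy sketch of the per-head targets built in the loop above: every head gets its own
# Double-Q bootstrap value and TD error against the shared batch. Function and variable names
# are illustrative only, not the author's API.
import numpy as np

def per_head_td_errors(q_t, q_tp1, q_tp1_online, actions, rewards, dones, gamma=1.0):
    """q_t, q_tp1, q_tp1_online: lists (one entry per head) of [batch, num_actions] arrays."""
    batch = np.arange(len(actions))
    errors = []
    for h in range(len(q_t)):
        a_star = q_tp1_online[h].argmax(axis=1)           # online-net argmax action
        boot = q_tp1[h][batch, a_star]                    # target-net value of that action
        target = rewards + gamma * (1.0 - dones) * boot
        errors.append(q_t[h][batch, actions] - target)
    return errors

_rng = np.random.default_rng(0)
_q = [_rng.normal(size=(5, 4)) for _ in range(2)]         # two heads, five transitions
print(per_head_td_errors(_q, _q, _q, np.array([0, 1, 2, 3, 0]),
                         np.zeros(5), np.zeros(5), gamma=0.99))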
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=False, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, thompson=True): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) elif thompson: act_f = build_act_with_thompson_sampling(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) eval_act_f = build_act_evaluate_thompson(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t, phi_xt = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from eval act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1, phi_target_xtp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 average_DQN = True double_q = False if double_q: print("building double") q_tp1_using_online_net, _ = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) elif average_DQN: print("building average dqn") k = 2 # we use k-1 for the average dqn - first k-1 for agenet and last k-1 for target prev_target_vars = q_func_vars # we use k-1 for the average dqn - first k-1 for agenet and last k-1 for target update_average_target_expr = [] q_values_ensemble = [] for j in range(k): q_tp1_net_j, _ = q_func(obs_tp1_input.get(), num_actions, scope="target_{}".format(j)) this_target_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_{}".format(j)) q_values_ensemble.append(q_tp1_net_j) update_target_expr_j = [] for var, var_target in zip( sorted(prev_target_vars, key=lambda v: v.name), sorted(this_target_vars, key=lambda v: v.name)): update_target_expr_j.append(var_target.assign(var)) update_target_expr_j = tf.group(*update_target_expr_j) update_target_expr_j_func = U.function( [], [], updates=[update_target_expr_j]) update_average_target_expr.append(update_target_expr_j_func) prev_target_vars = this_target_vars q_tp1_average = tf.reduce_mean(tf.stack(q_values_ensemble[:(k - 1)], axis=-1), axis=-1) q_tp1_best = tf.reduce_max(q_tp1_average, 1) else: print("building not double") q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is 
not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) if thompson: # Bayes Regression additions last_layer_weights = q_func_vars[-2] #target_q_func_vars[-2] phiphiT_op = tf.matmul(tf.transpose(phi_xt), phi_xt) phiY_op = tf.squeeze( tf.matmul(tf.expand_dims(q_t_selected_target, 0), phi_xt)) YY_op = tf.matmul(tf.expand_dims(q_t_selected_target, 0), tf.expand_dims(q_t_selected_target, -1)) feat_dim = phi_xt.shape[1].value feat = U.function([obs_t_input], phi_xt) feat_target = U.function([obs_tp1_input], phi_target_xtp1) # old q network evalution ensemble = False old_networks = None old_pseudo_counts_f = None outer_product_op = tf.matmul(tf.expand_dims(phi_xt, axis=-1), tf.expand_dims(phi_xt, axis=1)) if ensemble: old_networks = {i: None for i in range(5)} phiphiTs_inv = [] for i in range(5): q_t_old, phi_old = q_func(obs_t_input.get(), num_actions, scope="old_q_func_{}".format(i)) old_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/old_q_func_{}".format(i)) update_old_expr = [] for var, var_old in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(old_q_func_vars, key=lambda v: v.name)): update_old_expr.append(var_old.assign(var)) update_old_expr = tf.group(*update_old_expr) update_old = U.function([], [], updates=[update_old_expr]) feat_old = U.function([obs_t_input], phi_old) phiphiT_inv = tf.placeholder( tf.float32, [None] + [feat_dim, feat_dim], name="phiphiT_inv_{}".format(i)) phiphiTs_inv.append(phiphiT_inv) old_networks[i] = { "phi_old": phi_old, "phiphiT_inv": phiphiT_inv, "features": feat_old, "update": update_old } old_pseudo_counts = [] for i in range(5): old_pseudo_counts.append( tf.reduce_sum(tf.matmul( tf.matmul( tf.expand_dims(old_networks[i]['phi_old'], axis=1), old_networks[i]['phiphiT_inv']), tf.expand_dims(old_networks[i]['phi_old'], axis=-1)), axis=[1, 2])) # debug = tf.stack(old_pseudo_counts) old_pseudo_counts = tf.stack(old_pseudo_counts) old_pseudo_counts_f = U.function([obs_t_input, *phiphiTs_inv], old_pseudo_counts) q_t_old, phi_old = q_func(obs_t_input.get(), num_actions, scope="old_q_func", reuse=True) phiphiT_inv = tf.placeholder(tf.float32, [None] + [feat_dim, feat_dim], name="phiphiT_inv") pseudo_count = tf.reduce_sum(tf.matmul( tf.matmul(tf.expand_dims(phi_old, axis=1), phiphiT_inv), tf.expand_dims(phi_old, axis=-1)), axis=[1, 2]) phiphiTold_op = tf.matmul(tf.transpose(phi_old), phi_old) q_tp1_old, _ = q_func(obs_tp1_input.get(), num_actions, scope="old_target_q_func") if double_q: print("building double target for thompson") q_tp1_using_online_net_old, _ = q_func(obs_tp1_input.get(), num_actions, 
scope="old_q_func", reuse=True) q_tp1_best_using_online_net_old = tf.argmax( q_tp1_using_online_net_old, 1) q_tp1_best_old = tf.reduce_sum( q_tp1_old * tf.one_hot(q_tp1_best_using_online_net_old, num_actions), 1) elif average_DQN: print("building average target for thompson") q_tp1_average_old = tf.reduce_mean(tf.stack( q_values_ensemble[-(k - 1):], axis=-1), axis=-1) q_tp1_best_old = tf.reduce_max(q_tp1_average_old, 1) else: print("building not double") q_tp1_best_old = tf.reduce_max(q_tp1_old, 1) q_tp1_best_masked_old = (1.0 - done_mask_ph) * q_tp1_best_old q_t_selected_target_old = rew_t_ph + gamma * q_tp1_best_masked_old phiYold_op = tf.squeeze( tf.matmul(tf.expand_dims(q_t_selected_target_old, 0), phi_old)) YYold_op = tf.matmul(tf.expand_dims(q_t_selected_target_old, 0), tf.expand_dims(q_t_selected_target_old, -1)) sdp_ops = U.function( inputs=[obs_t_input, obs_tp1_input, phiphiT_inv], outputs=[pseudo_count, outer_product_op]) # Create callable functions blr_ops = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[phiphiT_op, phiY_op, YY_op]) blr_ops_old = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[phiphiTold_op, phiYold_op, YYold_op]) old_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/old_q_func") update_old_expr = [] for var, var_old in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(old_q_func_vars, key=lambda v: v.name)): update_old_expr.append(var_old.assign(var)) update_old_expr = tf.group(*update_old_expr) update_old = U.function([], [], updates=[update_old_expr]) if not average_DQN: old_target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/old_target_q_func") update_old_target_expr = [] for var, var_old in zip( sorted(target_q_func_vars, key=lambda v: v.name), sorted(old_target_q_func_vars, key=lambda v: v.name)): update_old_target_expr.append(var_old.assign(var)) update_old_target_expr = tf.group(*update_old_target_expr) feat_old = U.function([obs_t_input], phi_old) if average_DQN: update_old_target = update_average_target_expr else: update_old_target = U.function( [], [], updates=[update_old_target_expr]) blr_additions = { 'feat_dim': feat_dim, 'feature_extractor': feat, 'target_feature_extractor': feat_target, 'blr_ops': blr_ops, 'blr_ops_old': blr_ops_old, 'last_layer_weights': last_layer_weights, 'update_old': update_old, 'update_old_target': update_old_target, 'old_feature_extractor': feat_old, 'sdp_ops': sdp_ops, 'old_networks': old_networks, 'eval_act': eval_act_f, 'old_pseudo_counts': old_pseudo_counts_f } else: blr_additions = None return act_f, train, update_target, { 'q_values': q_values }, blr_additions
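# Sketch of how the phiphiT / phiY statistics returned by blr_ops are commonly folded into a
# Bayesian linear-regression posterior over the last layer for Thompson sampling. This is the
# standard BLR update under assumed noise and prior variances, not necessarily the exact update
# applied to the ops defined above.
import numpy as np

def blr_posterior(phiphiT, phiY, noise_var=1.0, prior_var=1.0):
    d = phiphiT.shape[0]
    precision = phiphiT / noise_var + np.eye(d) / prior_var   # posterior precision
    cov = np.linalg.inv(precision)
    mean = cov @ phiY / noise_var                              # posterior mean
    return mean, cov

_rng = np.random.default_rng(0)
_phi = _rng.normal(size=(64, 8))                  # features of 64 transitions
_y = _rng.normal(size=64)                         # their bootstrapped targets
_mean, _cov = blr_posterior(_phi.T @ _phi, _phi.T @ _y)
_w_sample = _rng.multivariate_normal(_mean, _cov)  # one Thompson sample of the head weights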
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
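# Reference NumPy version of the Huber loss applied to td_error throughout these builders via
# U.huber_loss; delta=1.0 (the usual baselines default) is assumed here. Quadratic near zero,
# linear in the tails, which keeps large TD errors from dominating the gradient.
import numpy as np

def huber_loss(x, delta=1.0):
    return np.where(np.abs(x) < delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

print(huber_loss(np.array([-3.0, -0.5, 0.0, 0.5, 3.0])))  # [2.5, 0.125, 0., 0.125, 2.5]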
def learn( env, test_env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution pi_zero = policy_fn( "zero_pi", ob_space, ac_space) # pi_0 will only be updated along with iterations reward = tf.placeholder(dtype=tf.float32, shape=[None]) # step rewards pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule bound_coeff = tf.placeholder( name='bound_coeff', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") next_ob = U.get_placeholder_cached( name="next_ob") # next step observation for updating q function ac = U.get_placeholder_cached( name="act") # action placeholder for computing q function mean_ac = U.get_placeholder_cached( name="mean_act") # action placeholder for computing q function kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent param_dist = tf.reduce_mean(tf.square(pi_params - old_pi_params)) mean_action_loss = tf.cast( tf.reduce_mean(tf.square(1.0 - pi.pd.mode() / oldpi.pd.mode())), tf.float32) pi_adv = (pi.qpred - pi.vpred) adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0]) normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) # qf_loss = tf.reduce_mean(tf.square(reward + gamma * pi.mean_qpred - pi.qpred)) qf_loss = tf.reduce_mean( U.huber_loss(reward + gamma * pi.mean_qpred - pi.qpred)) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) qf_losses = [qf_loss] vf_losses = [vf_loss] pol_loss = pol_surr + pol_entpen # pol_loss = pol_surr + pol_entpen # Advantage function should be improved losses = [pol_loss, pol_entpen, meankl, meanent] loss_names = ["pol_surr_2", "pol_entpen", "kl", "ent"] var_list = pi.get_trainable_variables() qf_var_list = [ v for v in var_list if 
v.name.split("/")[1].startswith("qf") ] mean_qf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("meanqf") ] vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) qf_lossandgrad = U.function( [ob, ac, next_ob, mean_ac, lrmult, reward, atarg], qf_losses + [U.flatgrad(qf_loss, qf_var_list)]) qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) assign_target_q_eq_eval_q = U.function( [], [], updates=[ tf.assign(target_q, eval_q) for (target_q, eval_q) in zipsame(mean_qf_var_list, qf_var_list) ]) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) mean_pi_actions = U.function( [ob], [pi.pd.mode()]) # later for computing pol_loss # Compute all losses compute_pol_losses = U.function([ob, ob, ac, lrmult, atarg], [pol_loss]) U.initialize() get_pi_flat_params = U.GetFlat(pol_var_list) set_pi_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() qf_adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards best_fitness = np.inf eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic=True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) # For train V Func # Build generator for all solutions actors = [] for i in range(popsize): newActor = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) actors.append(newActor) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) # Generate new samples # Train V func ob_segs = None for i in range(max_v_train_iter): logger.log("Iteration:" + str(iters_so_far) + " - sub-train iter for V func:" + str(i)) logger.log("Generate New Samples") seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg[ "rew"], seg["tdlamret"], \ seg["traj_index"] atarg = (atarg - atarg.mean()) / 
atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update( ob) # update running mean/std for normalization assign_old_eq_new( ) # set old parameter values to new parameter values # Train V function logger.log("Training V Func and Evaluating V Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(vf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) d_q = Dataset(dict(ob=ob, ac=ac, next_ob=next_ob, reward=reward, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) # Re-train q function logger.log("Training Q Func Evaluating Q Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d_q.iterate_once(optim_batchsize): *qf_losses, g = qf_lossandgrad( batch["ob"], batch["ac"], batch["next_ob"], mean_pi_actions(batch["ob"])[0], cur_lrmult, batch["reward"], batch["atarg"]) qf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(qf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) assign_target_q_eq_eval_q() pi0_fitness = compute_pol_losses(ob, ob, mean_pi_actions(ob)[0], cur_lrmult, atarg) logger.log("Best fitness for Pi0:" + str(np.mean(atarg))) logger.log("Best fitness for Pi0:" + str(pi0_fitness)) # CMAES Train Policy assign_old_eq_new() # set old parameter values to new parameter values assign_backup_eq_new() # backup current policy flatten_weights = get_pi_flat_params() opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 opt['seed'] = seed opt['AdaptSigma'] = True es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt) while True: if es.countiter >= gensize: logger.log("Max generations for current layer") break logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) logger.log("Sigma=" + str(es.sigma)) solutions = es.ask() costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): set_pi_flat_params(solution) losses = [] # cost = compute_pol_losses(ob_segs['ob'], ob_segs['ob'], mean_pi_actions(ob_segs['ob'])[0]) cost = compute_pol_losses(ob, ob, mean_pi_actions(ob)[0], cur_lrmult, atarg) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.99, solutions) costs += l2_decay # costs, real_costs = fitness_normalization(costs) costs, real_costs = fitness_rank(costs) es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) logger.log("best_fitness:" + str(best_fitness) + " current best fitness:" + str(es.result[1])) best_solution = es.result[0] best_fitness = es.result[1] logger.log("Best Solution Fitness:" + str(best_fitness)) set_pi_flat_params(best_solution) sigma = es.sigma iters_so_far += 1 episodes_so_far += sum(lens)
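# The helpers compute_weight_decay and fitness_rank used in the CMA-ES loop above are not shown;
# this is one common implementation of each (an L2 penalty on the flattened parameters, and a
# centred rank transform of the costs), offered as an assumption rather than the author's code.
import numpy as np

def compute_weight_decay(weight_decay, solutions):
    params = np.asarray(solutions)
    return weight_decay * np.mean(params * params, axis=1)    # per-solution L2 penalty

def fitness_rank(costs):
    costs = np.asarray(costs, dtype=np.float64)
    ranks = np.empty_like(costs)
    ranks[np.argsort(costs)] = np.arange(len(costs))
    centred = ranks / (len(costs) - 1) - 0.5                  # in [-0.5, 0.5], order-preserving
    return centred, costs                                     # (normalised costs, real costs)

print(fitness_rank([3.0, 1.0, 2.0]))   # (array([ 0.5, -0.5,  0. ]), array([3., 1., 2.]))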
def build_train_dueling(make_obs_ph, q_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="deepq", input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, imitate=False, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" act_f = build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim, hash_dim, use_rp, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") if imitate: imitate_act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="imitate_action") # EMDQN value_t_ph = tf.placeholder(tf.float32, [None], name='value_t') value_tp1_ph = tf.placeholder(tf.float32, [None], name='value_tp1') value_tp1_masked = (1.0 - done_mask_ph) * value_tp1_ph # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act # q_t_normalized = q_t - tf.max(q_t,) q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute RHS of bellman equation q_target = rew_t_ph + gamma * value_tp1_masked # compute the error (potentially clipped) td_error = q_target - (q_t_selected + value_t_ph) td_summary = tf.summary.scalar("td error", tf.reduce_mean(td_error)) # EMDQN print(q_t.shape) if imitate: imitation_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=imitate_act_t_ph, logits=q_t), axis=1) print(imitation_loss.shape) errors = U.huber_loss(td_error) + imitation_loss else: errors = U.huber_loss(td_error) total_summary = tf.summary.scalar("total error", tf.reduce_mean(errors)) value_summary = tf.summary.scalar("value_t", tf.reduce_mean(value_t_ph)) value_tp1_summary = tf.summary.scalar("value_tp1", tf.reduce_mean(value_tp1_ph)) q_summary = tf.summary.scalar("estimated qs", tf.reduce_mean(q_t_selected)) summaries=[td_summary, total_summary, value_summary, value_tp1_summary, q_summary] if imitate: imitate_summary = tf.summary.scalar("imitate loss", tf.reduce_mean(imitation_loss)) summaries.append(imitate_summary) summary = tf.summary.merge(summaries) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network inputs = [ obs_t_input, act_t_ph, rew_t_ph, done_mask_ph, importance_weights_ph, value_t_ph, value_tp1_ph ] if imitate: inputs.append(imitate_act_t_ph) # Create callable functions # EMDQN train = U.function( inputs=inputs, outputs=[td_error, summary], updates=[optimize_expr] ) return act_f, train
def co_build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, using_control_sharing=True): act_f = co_build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, using_control_sharing=using_control_sharing) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
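# co_build_train uses the standard double-DQN target: the online network picks
# the argmax action at t+1 and the target network evaluates it. An illustrative
# numpy restatement of that selection step (not the graph code above):
import numpy as np

def double_q_target_sketch(q_tp1_online, q_tp1_target, rew, done, gamma=0.99):
    best_actions = np.argmax(q_tp1_online, axis=1)                 # chosen by online net
    q_tp1_best = q_tp1_target[np.arange(len(rew)), best_actions]   # scored by target net
    return rew + gamma * (1.0 - done) * q_tp1_best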
def build_train_imitation(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=False, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array``` optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act_imitation(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # Q(s,a;θi) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # maxQ(s',a';θi-) # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-! OBSERVER !-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- # TED's set up placeholders ment_obs_t_input = make_obs_ph("ment_obs_t") ment_act_t_ph = tf.placeholder(tf.int32, [None], name="ment_action") ment_obs_tp1_input = make_obs_ph("ment_obs_tp1") old_error_ph = tf.placeholder(tf.float32, shape=[None], name="old_error") old_imp_weights_ph = tf.placeholder(tf.float32, [None], name="old_imp_weights") # TED's q network evaluation aug_q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act aug_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # TED's target q network evalution aug_q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=True) aug_target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # TED's q scores for actions which we know were selected in the given state. 
aug_q_t_selected = tf.reduce_sum( aug_q_t * tf.one_hot(act_t_ph, num_actions), 1) # Q(s,a;θi) aug_q_tp1_selected = tf.reduce_sum( q_tp1 * tf.one_hot(ment_act_t_ph, num_actions), 1) # Q(s',am;θi) aug_q_tp1_selected_masked = (1.0 - done_mask_ph) * aug_q_tp1_selected # TED's compute estimate of best possible value starting from state at t + 1 if double_q: aug_q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) aug_q_tp1_best_using_online_net = tf.argmax( aug_q_tp1_using_online_net, 1) aug_q_tp1_best = tf.reduce_sum( aug_q_tp1 * tf.one_hot(aug_q_tp1_best_using_online_net, num_actions), 1) else: aug_q_tp1_best = tf.reduce_max(aug_q_tp1, 1) aug_q_tp1_best_masked = ( 1.0 - done_mask_ph) * aug_q_tp1_best # maxQ(s',a';θi-) # TED's compute RHS of bellman equation aug_q_t_selected_target = rew_t_ph + gamma * tf.maximum( aug_q_tp1_best_masked, aug_q_tp1_selected_masked) # aug_q_t_selected_target = rew_t_ph + gamma * aug_q_tp1_best_masked # TED's compute the error (potentially clipped) aug_td_error = aug_q_t_selected - tf.stop_gradient( aug_q_t_selected_target) aug_errors = U.huber_loss(aug_td_error) aug_weighted_error = tf.reduce_mean(importance_weights_ph * aug_errors) # aug_weighted_error = tf.Print(aug_weighted_error, [tf.shape(importance_weights_ph)], "AGENT WEIGHTED ERROR: ") # TED's compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(aug_weighted_error, var_list=aug_q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) aug_optimize_expr = optimizer.apply_gradients(gradients) else: aug_optimize_expr = optimizer.minimize(aug_weighted_error, var_list=aug_q_func_vars) # -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-! OBSERVER !-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- # -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- MENTOR -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- # TED's mentor's q network evaluation ment_q_t = q_func(ment_obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act ment_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # TED's mentor's target q network evalution ment_q_tp1 = q_func(ment_obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=True) ment_target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # TED's mentor's q scores for action am which we know was selected in the given state. 
ment_q_t_selected = tf.reduce_sum( ment_q_t * tf.one_hot(ment_act_t_ph, num_actions), 1) # Q(sm,am;θi) ment_q_tp1_selected = tf.reduce_sum( ment_q_tp1 * tf.one_hot(ment_act_t_ph, num_actions), 1) # Q(sm',am;θi-) ment_q_tp1_selected_masked = (1.0 - done_mask_ph) * ment_q_tp1_selected # TED's compute estimate of best possible value starting from state at t + 1 if double_q: ment_q_tp1_using_online_net = q_func(ment_obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) ment_q_tp1_best_using_online_net = tf.argmax( ment_q_tp1_using_online_net, 1) ment_q_tp1_best = tf.reduce_sum( ment_q_tp1 * tf.one_hot(ment_q_tp1_best_using_online_net, num_actions), 1) else: ment_q_tp1_best = tf.reduce_max(ment_q_tp1, 1) ment_q_tp1_best_masked = ( 1.0 - done_mask_ph) * ment_q_tp1_best # maxQ(sm',a';θi-) # TED's compute RHS of bellman equation ment_q_t_selected_target = rew_t_ph + gamma * tf.maximum( ment_q_tp1_best_masked, ment_q_tp1_selected_masked) # TED's compute the error (potentially clipped) ment_td_error = ment_q_t_selected - tf.stop_gradient( ment_q_t_selected_target) ment_errors = U.huber_loss(ment_td_error) ment_weighted_error = tf.reduce_mean(importance_weights_ph * ment_errors) # ment_weighted_error = tf.Print(ment_weighted_error, [tf.shape(importance_weights_ph)], "MENTOR WEIGHTED ERROR: ") # TED's compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(ment_weighted_error, var_list=ment_q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) ment_optimize_expr = optimizer.apply_gradients(gradients) else: ment_optimize_expr = optimizer.minimize(ment_weighted_error, var_list=ment_q_func_vars) # -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- MENTOR -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- def temp_func1(): return aug_td_error, aug_optimize_expr # return td_error, optimize_expr def temp_func2(): return ment_td_error, ment_optimize_expr old_errors = U.huber_loss(old_error_ph) old_weighted_error = tf.reduce_mean(old_imp_weights_ph * old_errors) final_td_error, final_optimize_expr = tf.cond( tf.greater((ment_weighted_error - old_weighted_error)**2, (aug_weighted_error - old_weighted_error)**2), temp_func1, temp_func2) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) # TED's create callable functions trainAugmented = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, ment_obs_t_input, ment_obs_tp1_input, ment_act_t_ph, old_error_ph, old_imp_weights_ph ], outputs=final_td_error, updates=[final_optimize_expr]) return act_f, train, trainAugmented, update_target, { 'q_values': q_values }
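# The observer/mentor construction above applies, per training step, whichever
# update (augmented agent vs. mentor) corresponds to the larger squared change
# of the weighted TD loss relative to its previous value. A hedged plain-Python
# restatement of that tf.cond selection rule:
def pick_update_sketch(aug_error, ment_error, old_error):
    # Mirrors the tf.cond above: temp_func1 (augmented update) runs when the
    # mentor loss moved further from old_error, otherwise temp_func2 (mentor).
    if (ment_error - old_error) ** 2 > (aug_error - old_error) ** 2:
        return "augmented"
    return "mentor"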
def build_train(make_obs_ph, q_func, hr_func, num_actions, rl_optimizer, hr_optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables rl_optimizer: tf.train.Optimizer rl_optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, hr_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, hr_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # feedback placeholders obs_fb = make_obs_ph("obs_fb") act_fb_ph = tf.placeholder(tf.int32, [None], name="action_fb") feedback_ph = tf.placeholder(tf.float32, [None], name="feedback") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = rl_optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = rl_optimizer.apply_gradients(gradients) else: optimize_expr = rl_optimizer.minimize(weighted_error, var_list=q_func_vars) # update feedback function approximator (HR) batch_size = tf.shape(obs_fb.get())[0] pred_feedbacks = hr_func(obs_fb.get(), num_actions, scope="hr_func", reuse=True) indices = tf.stack([tf.range(batch_size), act_fb_ph], axis=-1) pred_feedbacks = tf.gather_nd(pred_feedbacks, indices) feedback_loss = tf.reduce_mean( -(feedback_ph * tf.log(pred_feedbacks) + (1 - feedback_ph) * tf.log(1 - pred_feedbacks))) fb_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/hr_func") feedback_train_op = hr_optimizer.minimize(feedback_loss, var_list=fb_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = 
tf.group(*update_target_expr) # Create callable functions train_rl = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) train_hr = U.function(inputs=[obs_fb, act_fb_ph, feedback_ph], outputs=[pred_feedbacks, feedback_loss], updates=[feedback_train_op]) _evaluate_hr = U.function(inputs=[obs_fb, act_fb_ph, feedback_ph], outputs=[pred_feedbacks, feedback_loss], updates=[]) def evaluate_hr(obs, actions, feedbacks): assert len(obs) == len(actions) == len(feedbacks) total_acc = [] total_loss = [] fb_batch_size = 5 for i in range(0, len(obs) - fb_batch_size, fb_batch_size): obs_batch = obs[i:i + fb_batch_size] action_batch = actions[i:i + fb_batch_size] feedback_batch = feedbacks[i:i + fb_batch_size] pred, loss = _evaluate_hr(obs_batch, action_batch, feedback_batch) acc = (np.round(pred) == feedback_batch).mean() total_acc.append(acc) total_loss.append(loss) return np.mean(total_acc), np.mean(total_loss) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train_rl, train_hr, evaluate_hr, update_target, { 'q_values': q_values }
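# The human-feedback head above fits -[y*log(p) + (1-y)*log(1-p)] directly,
# which assumes hr_func outputs probabilities strictly inside (0, 1). A hedged
# sketch of a numerically safer variant, assuming a hypothetical hr_func that
# returns pre-sigmoid logits for the taken actions instead:
import tensorflow as tf

def feedback_loss_sketch(pred_logits, feedback):
    # pred_logits: [batch] pre-sigmoid scores; feedback: [batch] float labels in {0., 1.}.
    return tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=feedback, logits=pred_logits))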
def build_train_mer(input_type, obs_shape, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", num_neg=10, latent_dim=32, alpha=0.1, beta=1e2, theta=10, loss_type=["contrast"], knn=4, c_loss_type="margin", b=100, batch_size=32, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if c_loss_type != "infonce": assert num_neg == 1 # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func", # reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN # tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name), magic_num = tf.get_variable(name='magic', shape=[1]) obs_input_query = U.ensure_tf_input( input_type(obs_shape, None, name="obs_query")) obs_input_positive = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_pos")) obs_input_negative = U.ensure_tf_input( input_type(obs_shape, batch_size * num_neg, name="enc_obs_neg")) obs_input_neighbour = U.ensure_tf_input( input_type(obs_shape, batch_size * knn, name="enc_obs_neighbour")) obs_input_uniformity_u = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_uni_u")) obs_input_uniformity_v = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_uni_v")) obs_input_weighted_product_u = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_wp_u")) obs_input_weighted_product_v = U.ensure_tf_input( input_type(obs_shape, batch_size, name="enc_obs_wp_v")) value_input_weighted_product_u = tf.placeholder(tf.float32, [batch_size], name="value_u") value_input_weighted_product_v = tf.placeholder(tf.float32, [batch_size], name="value_v") value_input_query = tf.placeholder(tf.float32, [batch_size], name="value") value_input_neighbour = tf.placeholder(tf.float32, [batch_size, knn], name="neighbour_value") action_embedding = tf.Variable(tf.random_normal( [num_actions, latent_dim], stddev=1), name="action_embedding") action_input = tf.placeholder(tf.int32, [batch_size], name="action") action_input_causal = tf.placeholder(tf.int32, [batch_size], name="action") reward_input_causal = tf.placeholder(tf.float32, [batch_size], 
name="action") inputs = [obs_input_query] if "contrast" in loss_type: inputs += [obs_input_positive, obs_input_negative] if "regression" in loss_type: inputs += [value_input_query] if "linear_model" in loss_type: inputs += [action_input] if "contrast" not in loss_type: inputs += [obs_input_positive] if "fit" in loss_type: # if "contrast" not in loss_type: # inputs+=[] inputs += [obs_input_neighbour, value_input_neighbour] if "regression" not in loss_type: inputs += [value_input_query] if "weight_product" in loss_type: inputs += [ obs_input_uniformity_u, obs_input_uniformity_v, obs_input_weighted_product_u, obs_input_weighted_product_v, value_input_weighted_product_u, value_input_weighted_product_v ] if "causality" in loss_type: inputs += [reward_input_causal, action_input_causal] z_old = model_func(obs_input_query.get(), num_actions, scope="target_model_func", reuse=False) z = model_func(obs_input_query.get(), num_actions, scope="model_func", reuse=tf.AUTO_REUSE) z_pos = model_func(obs_input_positive.get(), num_actions, scope="model_func", reuse=True) z_neg = model_func(obs_input_negative.get(), num_actions, scope="model_func", reuse=True) z_uni_u = model_func(obs_input_uniformity_u.get(), num_actions, scope="model_func", reuse=True) z_uni_v = model_func(obs_input_uniformity_v.get(), num_actions, scope="model_func", reuse=True) z_wp_u = model_func(obs_input_weighted_product_u.get(), num_actions, scope="model_func", reuse=True) z_wp_v = model_func(obs_input_weighted_product_v.get(), num_actions, scope="model_func", reuse=True) z_pos = tf.reshape(z_pos, [-1, latent_dim]) z_tar = tf.reshape(z, [-1, latent_dim]) if "contrast" in loss_type: z_neg = tf.reshape(z_neg, [-1, latent_dim]) contrast_loss, contrast_summary = contrastive_loss_fc( z_tar, z_pos, z_neg, c_type=c_loss_type, num_neg=num_neg, batch_size=batch_size, emb_dim=latent_dim) symmetry_loss, symmetry_summary = contrastive_loss_fc( z_pos, z_tar, z_neg, c_type=c_loss_type, num_neg=num_neg, batch_size=batch_size, emb_dim=latent_dim) contrast_loss += symmetry_loss z_neighbour = model_func(obs_input_neighbour.get(), num_actions, scope="model_func", reuse=True) # fit loss z_neighbour = tf.reshape(z_neighbour, [-1, knn, latent_dim]) square_dist = tf.square( tf.tile(tf.expand_dims(z_tar, 1), [1, knn, 1]) - z_neighbour) neighbour_dist = tf.reduce_sum(square_dist, axis=2) neighbour_coeff = tf.math.softmax(-neighbour_dist / b, axis=1) coeff_sum = tf.reduce_mean(tf.reduce_sum(neighbour_coeff, axis=1)) value_input_neighbour_mean = tf.reduce_mean(value_input_neighbour) fit_value = tf.reduce_sum(tf.multiply(neighbour_coeff, value_input_neighbour), axis=1) fit_loss = tf.reduce_mean(tf.abs(fit_value - value_input_query)) # causality loss reward_input_causal = tf.reshape(reward_input_causal, [1, -1]) reward_tile = tf.tile(reward_input_causal, [batch_size, 1]) # reward_mask = (reward_tile - tf.transpose(reward_tile)) ** 2 reward_mask = 1 - tf.cast( tf.equal((reward_tile - tf.transpose(reward_tile)), tf.constant(0.)), tf.float32) action_input_causal = tf.reshape(action_input_causal, [1, -1]) action_tile = tf.tile(action_input_causal, [batch_size, 1]) action_mask = tf.cast( tf.equal((action_tile - tf.transpose(action_tile)), tf.constant(0)), tf.float32) total_mask = tf.multiply(reward_mask, action_mask) z_tile = tf.tile(tf.expand_dims(z_tar, 1), [1, batch_size, 1]) z_diff = z_tile - tf.transpose(z_tile, perm=[1, 0, 2]) distance = tf.reduce_sum(z_diff**2, axis=2) exp_distance = tf.exp(-distance) causal_find_rate = (tf.reduce_sum(total_mask)) / 
(batch_size**2 - batch_size) causal_loss = tf.reduce_sum(tf.multiply(exp_distance, total_mask)) # regularization loss regularization_loss = -tf.maximum( 1., tf.reduce_mean(U.huber_loss(z_tar, 0.01))) regression_loss = tf.reduce_mean( tf.squared_difference(tf.norm(z_tar, axis=1), alpha * value_input_query)) + regularization_loss # linear model loss action_embeded = tf.matmul(tf.one_hot(action_input, num_actions), action_embedding) model_loss = tf.reduce_mean( tf.squared_difference(action_embeded + z_tar, z_pos)) + 0.01 * regularization_loss # weighted product loss uniformity_loss = tf.reduce_sum( tf.exp(2 * tf.reduce_sum(tf.multiply(z_uni_u, z_uni_v), axis=1) - 2)) value_weight = (value_input_weighted_product_u - value_input_weighted_product_v)**2 # angle = acos_safe(tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1)) angle = tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1) weighted_product = tf.multiply(value_weight, angle) wp_loss = tf.reduce_sum(weighted_product) total_loss = 0 if "contrast" in loss_type: total_loss += contrast_loss if "regression" in loss_type: total_loss += beta * regression_loss if "linear_model" in loss_type: total_loss += theta * model_loss if "fit" in loss_type: total_loss += beta * fit_loss if "causality" in loss_type: total_loss += theta * causal_loss if "weight_product" in loss_type: total_loss += 0.1 * uniformity_loss total_loss += wp_loss model_func_vars = U.scope_vars(U.absolute_scope_name("model_func")) model_func_vars_update = copy.copy(model_func_vars) if "linear_model" in loss_type: model_func_vars_update.append(action_embedding) target_model_func_vars = U.scope_vars( U.absolute_scope_name("target_model_func")) update_target_expr = [] for var in model_func_vars: print(var.name, var.shape) for var_target in target_model_func_vars: print(var_target.name, var_target.shape) for var, var_target in zip( sorted(model_func_vars, key=lambda v: v.name), sorted(target_model_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip( optimizer, total_loss, var_list=model_func_vars_update, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=model_func_vars_update) # Create callable functions # update_target_fn will be called periodically to copy Q network to target Q network z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1))) if "contrast" in loss_type: z_neg = tf.reshape(z_neg, [batch_size, num_neg, latent_dim]) negative_summary = tf.summary.scalar( "negative_dist", tf.reduce_mean(emb_dist(z_tar, z_neg[:, 0, :]))) positive_summary = tf.summary.scalar( "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos))) if "contrast" in loss_type: contrast_loss_summary = tf.summary.scalar( "contrast loss", tf.reduce_mean(contrast_loss)) regularization_loss_summary = tf.summary.scalar( "regularization loss", tf.reduce_mean(regularization_loss)) regression_loss_summary = tf.summary.scalar( "regression loss", tf.reduce_mean(regression_loss)) model_loss_summary = tf.summary.scalar("model loss", tf.reduce_mean(model_loss)) fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss)) fit_value_summary = tf.summary.scalar("fit value", tf.reduce_mean(fit_value)) neighbour_value_summary = tf.summary.scalar( "neighbour value", value_input_neighbour_mean) coeff_summary = tf.summary.scalar("coeff sum", coeff_sum) square_dist_summary = 
tf.summary.scalar("square_dist", tf.reduce_mean(square_dist)) z_neighbour_summary = tf.summary.scalar("z_neighbour_mean", tf.reduce_mean(z_neighbour)) # fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss)) # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss)) causal_efficiency_summary = tf.summary.scalar("causal efficiency", causal_find_rate) causal_loss_summary = tf.summary.scalar("causal loss", causal_loss) # reward_mask_summary = tf.summary.scalar("reward mask summary", debug_reward_mask) # action_mask_summary = tf.summary.scalar("action mask summary", debug_action_mask) uniformity_loss_summary = tf.summary.scalar("uniform loss", uniformity_loss) wp_loss_summary = tf.summary.scalar("weighted product loss", wp_loss) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) summaries = [ z_var_summary, total_loss_summary, regularization_loss_summary ] if "contrast" in loss_type: summaries += [ negative_summary, positive_summary, contrast_loss_summary ] summaries += contrast_summary if "regression" in loss_type: summaries.append(regression_loss_summary) if "linear_model" in loss_type: summaries.append(model_loss_summary) if "contrast" not in loss_type: summaries.append(positive_summary) if "fit" in loss_type: summaries.append(fit_loss_summary) summaries.append(fit_value_summary) summaries.append(neighbour_value_summary) summaries.append(coeff_summary) summaries.append(square_dist_summary) summaries.append(z_neighbour_summary) if "causality" in loss_type: summaries.append(causal_efficiency_summary) summaries.append(causal_loss_summary) # summaries.append(reward_mask_summary) # summaries.append(action_mask_summary) if "weight_product" in loss_type: summaries.append(uniformity_loss_summary) summaries.append(wp_loss_summary) summary = tf.summary.merge(summaries) outputs = [total_loss, summary] train = U.function(inputs=inputs, outputs=outputs, updates=[optimize_expr]) eval = U.function(inputs=inputs, outputs=outputs, updates=[]) z_func = U.function( inputs=[obs_input_query], outputs=[z_old], ) norm_func = U.function(inputs=[obs_input_query], outputs=[tf.norm(z_tar, axis=1)]) update_target_func = U.function([], [], updates=[update_target_expr]) return z_func, train, eval, norm_func, update_target_func
def loss_q_prioritize(self, states, q_target, actions, coef_q, weights):
    q_values = self.q_estimation(states)
    q_values = tf.reduce_sum(tf.multiply(q_values, actions), axis=1)
    loss = coef_q * tf.reduce_mean(weights * U.huber_loss(q_values - q_target))
    return loss
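# loss_q_prioritize expects per-sample importance weights from a prioritized
# replay buffer (not shown here). A hedged numpy sketch of how such weights are
# conventionally derived from stored priorities:
import numpy as np

def importance_weights_sketch(priorities, alpha=0.6, beta=0.4):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs = probs / probs.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()   # normalize so the largest weight is 1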
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that take a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip graident norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
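# U.huber_loss is used by every builder in this file; for reference, a standard
# baselines-style definition (quadratic below delta, linear above) is:
import tensorflow as tf

def huber_loss_sketch(x, delta=1.0):
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),
        delta * (tf.abs(x) - 0.5 * delta))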
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, distributed=False, v_min=-10.0, v_max=10.0, atoms=51): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. distributed: bool whether or not distributed version is enabled. v_min: float lower boundary for value, only works when distributed version is enabled. v_max: float upper boundary for value, only works when distributed version is enabled. atoms: int number of atoms, only works when distributed version is enabled. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ print("build train use distributed? 
", distributed) if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func, distributed=distributed, v_min=v_min, v_max=v_max, atoms=atoms) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, distributed=distributed, v_min=v_min, v_max=v_max, atoms=atoms) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") distributed_target_ph = tf.placeholder(tf.float32, [None, atoms], name="dis_target") # q network evaluation if not distributed: q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") else: q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. if not distributed: q_t_selected = tf.reduce_sum( q_t * tf.one_hot(act_t_ph, num_actions), 1) else: probability_qt = tf.nn.softmax(q_t) q_t_selected = tf.reduce_sum( q_t * tf.tile(tf.expand_dims(tf.one_hot(act_t_ph, num_actions), 2), [1, 1, atoms]), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: print("use double") if not distributed: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max( q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best = get_distibute_q(q_tp1_using_online_net, v_min, v_max, atoms, obs_tp1_input) a_tp1_best = tf.argmax(q_tp1_best, 1) probability_qt1 = tf.nn.softmax(q_tp1_using_online_net) q_tp1_best = tf.reduce_sum( probability_qt1 * tf.tile( tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2), [1, 1, atoms]), 1) else: print("not use double") if not distributed: q_tp1_best = tf.reduce_max(q_tp1, 1) else: if distributed: q_tp1_best = get_distibute_q(q_tp1, v_min, v_max, atoms, obs_tp1_input) a_tp1_best = tf.argmax(q_tp1_best, 1) probability_qt1 = tf.nn.softmax(q_tp1) q_tp1_best = tf.reduce_sum( probability_qt1 * tf.tile( tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2), [1, 1, atoms]), 1) mask = 1.0 - done_mask_ph if not distributed: q_tp1_best_masked = mask * q_tp1_best else: q_tp1_best_masked = q_tp1_best # compute RHS of bellman equation if not distributed: q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) else: clip_target = tf.clip_by_value(distributed_target_ph, 1e-8, 1.0) clip_select = tf.clip_by_value(tf.nn.softmax(q_t_selected), 1e-8, 1.0) # use kl divergence td_error = tf.reduce_sum( clip_target * 
(tf.log(clip_target) - tf.log(clip_select)), axis=-1) errors = tf.nn.softmax_cross_entropy_with_logits( labels=distributed_target_ph, logits=q_t_selected) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions if distributed: train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, distributed_target_ph ], outputs=td_error, updates=[optimize_expr]) else: train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) q_tp1_best_final = U.function([obs_tp1_input], q_tp1_best) return act_f, train, update_target, { 'q_values': q_values, 'q_t1_best': q_tp1_best_final }
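# In the distributed (C51-style) branch the projected target distribution is
# fed in through distributed_target_ph, so the categorical projection happens
# outside this graph. A hedged numpy sketch of that projection for a single
# transition, using the same v_min / v_max / atoms support:
import numpy as np

def project_distribution_sketch(next_probs, reward, done, gamma, v_min, v_max, atoms):
    # next_probs: [atoms] probability mass of the target network at s'.
    z = np.linspace(v_min, v_max, atoms)
    delta_z = (v_max - v_min) / (atoms - 1)
    target = np.zeros(atoms)
    tz = np.clip(reward + (1.0 - done) * gamma * z, v_min, v_max)
    b = (tz - v_min) / delta_z
    lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)
    for j in range(atoms):
        target[lower[j]] += next_probs[j] * (upper[j] - b[j])
        target[upper[j]] += next_probs[j] * (b[j] - lower[j])
        if lower[j] == upper[j]:          # b landed exactly on an atom
            target[lower[j]] += next_probs[j]
    return target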
def build_train(make_obs_ph, q_func, num_actions, grad_norm_clipping=None, gamma=1.0, deterministic_filter=False, random_filter=False, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func, deterministic_filter=deterministic_filter, random_filter=random_filter) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, deterministic_filter=deterministic_filter, random_filter=random_filter) with tf.variable_scope(scope, reuse=reuse): # set up placeholders lr_ph = tf.placeholder(tf.float32, name="lr") obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(U.data_type, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(U.data_type, [None], name="done") importance_weights_ph = tf.placeholder(U.data_type, [None], name="weight") board_size = obs_t_input.get().get_shape().as_list()[1] obs_t = transform_obses(obs_t_input.get()) obs_tp1 = transform_obses(obs_tp1_input.get()) act_t = transform_actions(act_t_ph, board_size) if deterministic_filter: invalid_masks_tp1 = build_invalid_masks(obs_tp1) # q network evaluation q_t = q_func(obs_t, num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1, num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum( q_t * tf.one_hot(act_t, num_actions, dtype=U.data_type), axis=1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1, num_actions, scope="q_func", reuse=True) if deterministic_filter: q_tp1_using_online_net = build_q_filter( q_tp1_using_online_net, invalid_masks_tp1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1, output_type=U.index_type) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions, dtype=U.data_type), 1) else: if deterministic_filter: q_tp1 = build_q_filter(q_tp1, invalid_masks_tp1) q_tp1_best = tf.reduce_max(q_tp1, axis=1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) weighted_error = tf.reduce_mean(importance_weights_ph * U.huber_loss(td_error)) regularizer = tf.add_n([tf.nn.l2_loss(var) for var in q_func_vars]) * 0.0001 total_error = weighted_error + regularizer # optimizer = tf.train.MomentumOptimizer( # learning_rate=lr_ph, momentum=0.9) optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ lr_ph, obs_t_input, act_t_ph, rew_t_ph, 
obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[td_error, weighted_error, total_error], updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
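# build_invalid_masks and build_q_filter are defined elsewhere; a hedged sketch
# of the filtering idea (push Q-values of invalid board moves to a very large
# negative constant so neither argmax nor max can select them):
import tensorflow as tf

def q_filter_sketch(q_values, invalid_masks, neg_value=-1e9):
    # invalid_masks: [batch, num_actions], 1.0 where the move is invalid.
    return q_values + invalid_masks * neg_value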
def __init__(self, obs_dim, n_acts, seed, lr, gamma, double_q=True, grad_val_clipping=None, grad_norm_clipping=None): # grad_val_clipping=0.5, # grad_norm_clipping=5.0): sess = U.get_session() self.sess = sess set_global_seeds(seed) # create placeholders for the input data for the current and next timesteps cur_input = create_input_placeholders(obs_dim, 'cur_input') next_input = create_input_placeholders(obs_dim, 'next_input') # create placeholders for the output data for the current timestep cur_output = create_output_placeholders(n_acts, 'cur_out') # calculate the q value for the chosen action q_vals_main_cur = get_model(cur_input['obs_ph'], n_acts, 'main') q_a = tf.reduce_max(tf.cast(cur_output['act_ph'], dtype=tf.float32) * q_vals_main_cur, axis=-1) # calculate the q value for the target network q_vals_target_next = get_model(next_input['obs_ph'], n_acts, 'target') if double_q: q_vals_main_next = get_model(next_input['obs_ph'], n_acts, 'main') next_act_main = tf.argmax(q_vals_main_next, axis=-1) q_vals_target_next_best = tf.reduce_max( q_vals_target_next * tf.one_hot(next_act_main, n_acts), axis=-1) else: q_vals_target_next_best = tf.reduce_max(q_vals_target_next, axis=-1) done_mask = 1 - cur_output['done_ph'] q_target = done_mask * gamma * q_vals_target_next_best q_target = cur_output['rew_ph'] + q_target # create the loss function td_error = q_a - tf.stop_gradient(q_target) adjusted_square_error = U.huber_loss(td_error) loss = tf.reduce_mean(adjusted_square_error) # make target update operation main_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='main') target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target') assign_ops_target = [ tf.assign(target_var, main_var) for target_var, main_var in zip(target_vars, main_vars) ] target_update_op = tf.group(*assign_ops_target) def update_target(): sess.run(target_update_op) # make train function optimizer = tf.train.AdamOptimizer(learning_rate=lr) gradients, variables = zip(*optimizer.compute_gradients(loss)) gradients = list(gradients) # if grad_val_clipping: # for i, grad in enumerate(gradients): # if grad is not None: # gradients[i] = tf.clip_by_value(grad, -grad_val_clipping, grad_val_clipping) # if grad_norm_clipping: # gradients, global_norm = tf.clip_by_global_norm(gradients, grad_norm_clipping) train_op = optimizer.apply_gradients(zip(gradients, variables)) def train(batch): feed_dict = { cur_input['obs_ph']: batch['cur_obs'], next_input['obs_ph']: batch['next_obs'], cur_output['act_ph']: batch['acts'], cur_output['rew_ph']: batch['rews'], cur_output['done_ph']: batch['done'], } sess.run(train_op, feed_dict=feed_dict) self.train = train self.update_target = update_target self.save = functools.partial(U.save_variables, sess=sess) self.load = functools.partial(U.load_variables, sess=sess) print("Initialized Model")
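# The value/norm gradient clipping in __init__ above is left commented out. A
# hedged sketch of what enabling both clips with the same grad_val_clipping and
# grad_norm_clipping arguments could look like (illustrative, not the committed
# behaviour):
import tensorflow as tf

def clip_gradients_sketch(gradients, grad_val_clipping=None, grad_norm_clipping=None):
    gradients = list(gradients)
    if grad_val_clipping is not None:
        gradients = [tf.clip_by_value(g, -grad_val_clipping, grad_val_clipping)
                     if g is not None else g for g in gradients]
    if grad_norm_clipping is not None:
        # tf.clip_by_global_norm ignores None entries and returns them unchanged.
        gradients, _ = tf.clip_by_global_norm(gradients, grad_norm_clipping)
    return gradients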