def test_MpiAdam():
    """Check that MpiAdam reproduces the loss trajectory of tf.train.AdamOptimizer.

    The same two variables are optimised twice from the same initial values:
    first with TensorFlow's Adam op, then with ``MpiAdam`` fed by flat
    gradients. The per-step losses of both runs are printed and compared.

    Requires a default TensorFlow session to be active.

    Raises
    ------
    AssertionError
        if the two loss trajectories differ by more than ``atol=1e-4``.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference run: TensorFlow's own Adam implementation.
    tf.get_default_session().run(tf.global_variables_initializer())
    reference_losses = []
    for i in range(10):
        reference_loss = do_update()
        print(i, reference_loss)
        reference_losses.append(reference_loss)

    # Reset the variables (and Adam slots) before the MpiAdam run.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # Deliberately no `updates=[update_op]` here: running the TF Adam step
    # together with MpiAdam.update would move the variables twice per
    # iteration and make the comparison meaningless.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    mpi_losses = []
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
        mpi_losses.append(l)

    np.testing.assert_allclose(np.array(reference_losses), np.array(mpi_losses), atol=1e-4)
def validate_probtype(probtype, pdparam):
    """Statistically sanity-check a probability-distribution type.

    Two Monte-Carlo identities are asserted, each to within three standard
    errors computed from 100000 samples:

    1. mean negative log-likelihood of samples == entropy reported by the
       distribution;
    2. KL[p, q] == -entropy[p] - E_p[log q], where q uses ``pdparam``
       perturbed by Gaussian noise of scale 0.1.

    Parameters
    ----------
    probtype:
        object providing ``param_placeholder``, ``sample_placeholder`` and
        ``pdfromflat``.
    pdparam: np.ndarray
        flat parameter vector (indexed as ``pdparam[None, :]``).

    Raises
    ------
    AssertionError
        if either identity fails the 3-sigma check.
    """
    num_samples = 100000

    # --- Identity 1: mean negative log likelihood == differential entropy ---
    params_val = np.repeat(pdparam[None, :], num_samples, axis=0)
    params_ph = probtype.param_placeholder([num_samples])
    sample_ph = probtype.sample_placeholder([num_samples])
    dist = probtype.pdfromflat(params_ph)
    loglik_fn = U.function([sample_ph, params_ph], dist.logp(sample_ph))
    entropy_fn = U.function([params_ph], dist.entropy())

    session = tf.get_default_session()
    samples_val = session.run(dist.sample(), feed_dict={params_ph: params_val})

    loglik_vals = loglik_fn(samples_val, params_val)
    entropy_from_ll = -loglik_vals.mean()  # pylint: disable=E1101
    entropy_stderr = loglik_vals.std() / np.sqrt(num_samples)  # pylint: disable=E1101
    entropy_val = entropy_fn(params_val).mean()  # pylint: disable=E1101
    entropy_gap = np.abs(entropy_val - entropy_from_ll)
    assert entropy_gap < 3 * entropy_stderr  # within 3 sigmas

    # --- Identity 2: kldiv[p,q] = - ent[p] - E_p[log q] ---
    other_params_ph = probtype.param_placeholder([num_samples])
    other_dist = probtype.pdfromflat(other_params_ph)
    perturbed = pdparam + np.random.randn(pdparam.size) * 0.1
    other_params_val = np.repeat(perturbed[None, :], num_samples, axis=0)
    kl_fn = U.function([params_ph, other_params_ph], dist.kl(other_dist))
    kl_val = kl_fn(params_val, other_params_val).mean()  # pylint: disable=E1101

    # Log-likelihood of the samples drawn from p, evaluated under q.
    loglik_vals = loglik_fn(samples_val, other_params_val)
    kl_from_ll = -entropy_val - loglik_vals.mean()  # pylint: disable=E1101
    kl_stderr = loglik_vals.std() / np.sqrt(num_samples)  # pylint: disable=E1101
    kl_gap = np.abs(kl_val - kl_from_ll)
    assert kl_gap < 3 * kl_stderr  # within 3 sigmas

    print('ok on', probtype, pdparam)
def build_act_greedy(make_obs_ph, q_func, num_actions, scope="deepq", reuse=True, eps=0.0):
    """Build an epsilon-greedy act function whose epsilon is fixed at build time.

    Parameters
    ----------
    make_obs_ph: str -> placeholder wrapper
        called with "observation"; the result must expose ``.get()``.
    q_func: callable
        invoked as ``q_func(obs, num_actions, scope="q_func")``; only the
        first ``num_actions`` columns of its output are used for the argmax.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        variable scope in which the graph is built.
    reuse: bool or None
        passed to ``tf.variable_scope``.
    eps: float
        probability of replacing the greedy action with a uniform random one
        when acting stochastically.

    Returns
    -------
    act: (ob, stochastic=True) -> actions
        selects actions for a batch of observations.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        greedy_actions = tf.argmax(q_values[:, :num_actions], axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        uniform_actions = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        explore_draw = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32)
        explore_mask = explore_draw < eps
        eps_greedy_actions = tf.where(explore_mask, uniform_actions, greedy_actions)

        def _stochastic_branch():
            return eps_greedy_actions

        def _deterministic_branch():
            return greedy_actions

        output_actions = tf.cond(stochastic_ph, _stochastic_branch, _deterministic_branch)
        _act = U.function(inputs=[observations_ph, stochastic_ph],
                          outputs=output_actions)

        def act(ob, stochastic=True):
            return _act(ob, stochastic)

        return act
def build_act(make_obs_ph, q_func, num_actions, scope="setdeepq", reuse=None):
    """Build the epsilon-greedy act function over the sum of two Q networks.

    Parameters
    ----------
    make_obs_ph: str -> placeholder wrapper
        called with "observation"; the result must expose ``.get()``.
    q_func: callable
        invoked as ``q_func(obs, num_actions, scope=..., reuse=reuse)`` once
        with scope "q1_func" and once with scope "q2_func".
    num_actions: int
        number of actions.
    scope: str or VariableScope
        variable scope in which the graph is built.
    reuse: bool or None
        passed to the variable scope and to both ``q_func`` calls.

    Returns
    -------
    act: (ob, stochastic=True, update_eps=-1) -> actions
        selects actions for a batch of observations. A non-negative
        ``update_eps`` overwrites the stored epsilon; a negative value
        leaves it unchanged.
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_eps")

        eps = tf.compat.v1.get_variable(
            "eps", (), initializer=tf.compat.v1.constant_initializer(0))

        # Clipped double Q: two independently parameterised networks.
        q1_values = q_func(observations_ph.get(), num_actions, scope="q1_func", reuse=reuse)
        q2_values = q_func(observations_ph.get(), num_actions, scope="q2_func", reuse=reuse)

        # The greedy action maximises q1 + q2.
        greedy_actions = tf.argmax(input=q1_values+q2_values, axis=1)

        batch_size = tf.shape(input=observations_ph.get())[0]
        uniform_actions = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        explore_draw = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32)
        explore_mask = explore_draw < eps
        eps_greedy_actions = tf.compat.v1.where(explore_mask, uniform_actions, greedy_actions)

        output_actions = tf.cond(
            pred=stochastic_ph,
            true_fn=lambda: eps_greedy_actions,
            false_fn=lambda: greedy_actions,
        )

        # Negative update_eps means "keep the current epsilon".
        new_eps = tf.cond(
            pred=update_eps_ph >= 0,
            true_fn=lambda: update_eps_ph,
            false_fn=lambda: eps,
        )
        update_eps_expr = eps.assign(new_eps)

        _act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr],
        )

        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps)

        return act
def test_function():
    """Check that `function` fills an omitted input from `givens`.

    Builds z = 3*x + 2*y with y defaulting to 0 and verifies the result both
    with and without an explicit y.
    """
    graph = tf.Graph()
    with graph.as_default():
        x = tf.compat.v1.placeholder(tf.int32, (), name="x")
        y = tf.compat.v1.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        linear_fn = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            # (positional args, expected value)
            cases = (
                ((2,), 6),
                ((2, 2), 10),
            )
            for call_args, expected in cases:
                assert linear_fn(*call_args) == expected
def test_multikwargs():
    """Check `function` with two placeholders that were both created with name "x".

    The second placeholder is created inside variable scope "other" and is
    given a default of 0 through `givens`.
    """
    graph = tf.Graph()
    with graph.as_default():
        x = tf.compat.v1.placeholder(tf.int32, (), name="x")
        with tf.compat.v1.variable_scope("other"):
            x2 = tf.compat.v1.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2
        linear_fn = function([x, x2], z, givens={x2: 0})

        with single_threaded_session():
            initialize()

            # (positional args, expected value)
            cases = (
                ((2,), 6),
                ((2, 2), 10),
            )
            for call_args, expected in cases:
                assert linear_fn(*call_args) == expected
def build_act_bayesian(make_obs_ph, q_func, num_actions, scope="deepadfq", reuse=None):
    """Build an act function that samples Q-values from per-action Gaussians.

    The network outputs ``2 * num_actions`` values per observation: the first
    ``num_actions`` are treated as means and the remaining ones as
    ``-log(sd)``. One value is sampled per (observation, action) pair and the
    action with the largest sample is returned.

    Parameters
    ----------
    make_obs_ph: str -> placeholder wrapper
        called with "observation"; the result must expose ``.get()``.
    q_func: callable
        invoked as ``q_func(obs, num_actions * 2, scope="q_func")``.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        variable scope in which the graph is built.
    reuse: bool or None
        passed to the variable scope.

    Returns
    -------
    act: (ob, stochastic=True, update_eps=-1) -> actions
        ``stochastic`` and ``update_eps`` are accepted for signature
        compatibility with the other act functions and are ignored.
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        q_values = q_func(observations_ph.get(), num_actions*2, scope="q_func")  # mean and -log(sd)
        q_means = q_values[:, :num_actions]
        q_sds = tf.math.exp(-q_values[:, num_actions:])

        # Draw independent noise for every (observation, action) entry.
        # Requesting shape () would generate a single scalar that is then
        # broadcast across the whole batch and all actions, making the
        # per-action samples perfectly correlated.
        noise = tf.random.normal(tf.shape(input=q_means))
        samples = q_means + q_sds * noise

        output_actions = tf.argmax(input=samples, axis=1)
        _act = U.function(inputs=[observations_ph], outputs=output_actions)

        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob)

        return act
def build_act_greedy(make_obs_ph, q_func, num_actions, scope="setdeepq", reuse=True, eps=0.0):
    """Build a fixed-epsilon greedy act function over the sum of two Q networks.

    Parameters
    ----------
    make_obs_ph: str -> placeholder wrapper
        called with "observation"; the result must expose ``.get()``.
    q_func: object with a ``forward`` method
        ``q_func.forward(obs, num_actions, scope=..., reuse=reuse)`` is called
        once with scope "q1_func" and once with scope "q2_func".
    num_actions: int
        number of actions.
    scope: str or VariableScope
        variable scope in which the graph is built.
    reuse: bool or None
        passed to the variable scope and to both ``forward`` calls.
    eps: float
        probability of replacing the greedy action with a uniform random one
        when acting stochastically.

    Returns
    -------
    act: (ob, stochastic=True) -> actions
        selects actions for a batch of observations.
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")

        # Clipped double Q: two independently parameterised networks.
        # NOTE(review): this uses `q_func.forward(...)`; confirm that every
        # q_func passed in actually exposes `forward` (other builders may call
        # the object directly).
        q1_values = q_func.forward(
            observations_ph.get(), num_actions, scope="q1_func", reuse=reuse)
        q2_values = q_func.forward(
            observations_ph.get(), num_actions, scope="q2_func", reuse=reuse)

        # The greedy action maximises q1 + q2.
        greedy_actions = tf.argmax(input=q1_values + q2_values, axis=1)

        batch_size = tf.shape(input=observations_ph.get())[0]
        uniform_actions = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        explore_draw = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32)
        explore_mask = explore_draw < eps
        eps_greedy_actions = tf.compat.v1.where(explore_mask, uniform_actions, greedy_actions)

        def _stochastic_branch():
            return eps_greedy_actions

        def _deterministic_branch():
            return greedy_actions

        output_actions = tf.cond(
            pred=stochastic_ph,
            true_fn=_stochastic_branch,
            false_fn=_deterministic_branch,
        )
        _act = U.function(inputs=[observations_ph, stochastic_ph],
                          outputs=output_actions)

        def act(ob, stochastic=True):
            return _act(ob, stochastic)

        return act
def __init__(self, epsilon=1e-2, shape=()):
    """Create the running-statistics variables and the op that increments them.

    Parameters
    ----------
    epsilon: float
        initial value of the running sum of squares and of the count.
    shape: tuple
        shape of the running sum and running sum of squares.

    Attributes set
    --------------
    mean: float32 tensor, ``sum / count``.
    std: float32 tensor, ``sqrt(max(sumsq / count - mean**2, 1e-2))``.
    incfiltparams: callable ``(sum, sumsq, count) -> []`` that adds its
        arguments to the corresponding variables.
    """
    def _float64_stat(name, var_shape, initial_value):
        # Non-trainable float64 accumulator initialised to a constant.
        return tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.compat.v1.constant_initializer(initial_value),
            name=name,
            trainable=False)

    self._sum = _float64_stat("runningsum", shape, 0.0)
    self._sumsq = _float64_stat("runningsumsq", shape, epsilon)
    self._count = _float64_stat("count", (), epsilon)
    self.shape = shape

    self.mean = tf.cast(self._sum / self._count, dtype=tf.float32)
    mean_of_squares = tf.cast(self._sumsq / self._count, dtype=tf.float32)
    # The variance estimate is floored at 1e-2 before the square root.
    variance = tf.maximum(mean_of_squares - tf.square(self.mean), 1e-2)
    self.std = tf.sqrt(variance)

    sum_increment = tf.compat.v1.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_increment = tf.compat.v1.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_increment = tf.compat.v1.placeholder(shape=[], dtype=tf.float64, name='count')

    increment_ops = [
        tf.compat.v1.assign_add(self._sum, sum_increment),
        tf.compat.v1.assign_add(self._sumsq, sumsq_increment),
        tf.compat.v1.assign_add(self._count, count_increment),
    ]
    self.incfiltparams = U.function(
        [sum_increment, sumsq_increment, count_increment], [], updates=increment_ops)
def build_train(make_obs_ph, model, num_actions, optimizer_f, grad_norm_clipping=None, gamma=1.0, double_q=False, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, test_eps=0.05, learning_rate = 0.001, learning_rate_decay_factor=0.99, learning_rate_growth_factor=1.001):
    """Creates the train function (UNFINISHED: see the NOTE(review) comments in the body).

    NOTE(review): as written this function does not parse. The projection
    loop ends in a dangling `m_vec[l] = m_vec[l] +`, and several names used
    below (`num_outputs`, `num_atoms`, `V_min`, `V_max`, `del_z`, `m`,
    `atom_p`, `q_t`, `q_tp1`, `q_func_vars`, `target_q_func_vars`) are never
    assigned inside this function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    model: callable
        called as model(observation, num_outputs, scope=..., reuse=...) for the
        "atom_func" and "target_atom_func" scopes; also forwarded to the act builders.
    num_actions: int
        number of actions
    optimizer_f: callable
        called as optimizer_f(learning_rate=lr) to create the optimizer.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        not read anywhere in the visible body.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        if true, the act function is built with build_act_with_param_noise,
        otherwise with build_act.
    param_noise_filter_func: tf.Variable -> bool
        forwarded to build_act_with_param_noise when param_noise is True.
    test_eps: float
        epsilon passed to build_act_greedy.
    learning_rate: float
        initial value of the non-trainable learning-rate variable.
    learning_rate_decay_factor: float
        multiplier applied by the returned decay op (result clipped to [1e-5, 1e-3]).
    learning_rate_growth_factor: float
        multiplier applied by the returned growth op (result clipped to [1e-5, 1e-3]).

    Returns
    -------
    act_f:
        act function built by build_act or build_act_with_param_noise.
    act_greedy:
        act function built by build_act_greedy with eps=test_eps.
    q_values: (obs) -> q_t
    train: (obs_t, action, reward, obs_tp1, done, weight) -> [td_error, lr]
        also runs the optimization step.
    update_target: () -> ()
        assigns each variable in q_func_vars to its counterpart in target_q_func_vars.
    learning_rate_decay_op, learning_rate_growth_op:
        ops that rescale the learning-rate variable.
    debug: {str: function}
        currently {'q_values': q_values}.
    """
    # Pick the exploration-time act function; the greedy one always reuses variables.
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, model, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, model, num_actions, scope=scope, reuse=reuse)
    act_greedy = build_act_greedy(make_obs_ph, model, num_actions, scope=scope, reuse=True, eps=test_eps)
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        # Learning rate adjustment: lr is a variable that the two ops below
        # rescale in place, always clipped to [1e-5, 1e-3].
        lr = tf.Variable(float(learning_rate), trainable=False, dtype = tf.float32)
        learning_rate_decay_op = lr.assign(tf.clip_by_value(lr*learning_rate_decay_factor, 1e-5, 1e-3))
        learning_rate_growth_op = lr.assign(tf.clip_by_value(lr*learning_rate_growth_factor, 1e-5, 1e-3))
        optimizer = optimizer_f(learning_rate = lr)
        # q network evaluation
        # NOTE(review): `num_outputs` is not defined in this function.
        atom_t = model(obs_t_input.get(), num_outputs, scope="atom_func", reuse=True) # reuse parameters from act
        atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/atom_func")
        atom_p_t = tf.nn.softmax(atom_t)
        # target q network evaluation
        atom_tp1 = model(obs_tp1_input.get(), num_outputs, scope="target_atom_func")
        target_atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_atom_func")
        atom_p_tp1 = tf.nn.softmax(atom_tp1)
        # NOTE(review): the loop below looks like an unfinished projection of
        # the target distribution onto fixed atoms. `num_atoms`, `V_min`,
        # `V_max` and `del_z` are not defined here; `tf.clip` and `tf.astype`
        # do not look like TensorFlow API names (presumably
        # `tf.clip_by_value` / `tf.cast` were intended); a tensor created with
        # tf.constant does not support item assignment; and the last
        # statement is incomplete.
        m_vec = tf.constant(0.0, dtype=tf.float32, shape=(num_atoms))
        for j in range(num_atoms):
            Tz_j = tf.clip(rew_t_ph + gamma * (V_min + j * del_z), V_min, V_max)
            b_j = (Tz_j - V_min)/del_z
            l = tf.astype(tf.math.floor(b_j), tf.int32)
            u = tf.astype(tf.math.ceil(b_j), tf.int32)
            m_vec[l] = m_vec[l] +
        # NOTE(review): `m` and `atom_p` are not defined (maybe `m_vec` and
        # `atom_p_t`?); `cem_loss` is not used afterwards.
        cem_loss = tf.reduce_sum(tf.math.multiply(m, tf.log(atom_p)))
        # q scores for actions which we know were selected in the given state.
        # NOTE(review): `q_t` / `q_tp1` are not defined in this function; the
        # remainder appears to be left over from a scalar Q-learning version.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
        # compute estimate of best possible value starting from state at t + 1
        q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best
        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        # NOTE(review): `q_func_vars` / `target_q_func_vars` are not defined;
        # only `atom_func_vars` / `target_atom_func_vars` are collected above.
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
        # update_target_fn will be called periodically to copy Q network to target Q network
        # Variables are paired by sorting both lists on variable name.
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)
        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph
            ],
            outputs=[td_error, lr],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])
        q_values = U.function(inputs=[obs_t_input], outputs=q_t)
        return act_f, act_greedy, q_values, train, update_target, learning_rate_decay_op, learning_rate_growth_op, {'q_values': q_values}
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration
    (https://arxiv.org/abs/1706.01905).

    Three copies of the Q network are built under `scope`: "q_func"
    (unperturbed), "perturbed_q_func" (used to choose actions) and
    "adaptive_q_func" (used only to adapt the noise scale).

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: callable
        called as q_func(observation, num_actions, scope=...) and expected to
        return a tensor of shape (batch_size, num_actions).
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed.
        If None, default_param_noise_filter is used.

    Returns
    -------
    act: (ob, reset, update_param_noise_threshold, update_param_noise_scale,
          stochastic=True, update_eps=-1) -> actions
        - reset: bool, re-perturb "perturbed_q_func" from "q_func".
        - update_param_noise_threshold: float, a value >= 0 overwrites the KL
          threshold; a negative value leaves it unchanged.
        - update_param_noise_scale: bool, adapt the noise scale by a factor of
          1.01 up or down depending on the measured KL.
        - update_eps: float, a value >= 0 overwrites epsilon.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False)
        param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False)

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")

        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the
        # perturbation is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            # The adaptive copy must be re-perturbed before the KL is compared.
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        # A negative placeholder value means "keep the current threshold".
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0,
                    lambda: update_param_noise_threshold_ph,
                    lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        updates = [
            update_eps_expr,
            tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
        ]
        _act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph,
                    update_param_noise_threshold_ph, update_param_noise_scale_ph],
            outputs=output_actions,
            # The threshold default must be negative: the previous default of
            # False is fed as 0.0, which passes the `>= 0` test above and
            # would overwrite the threshold with 0.
            givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False,
                    update_param_noise_threshold_ph: -1.0,
                    update_param_noise_scale_ph: False},
            updates=updates)

        def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)

        return act
def build_train(make_obs_ph, q_func, num_actions, optimizer_f, grad_norm_clipping=None, gamma=1.0, scope="setdeepq", reuse=None, test_eps=0.05, lr_init = 0.001, lr_period_steps=50000, tau=0.05):
    """Creates the train function for clipped double Q-learning with two Q networks.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: callable
        called as q_func(observation, num_actions, scope=..., reuse=...) and
        expected to return a tensor of shape (batch_size, num_actions).
        Also forwarded to build_act and build_act_greedy.
    num_actions: int
        number of actions
    optimizer_f: callable
        called as optimizer_f(learning_rate=lr) to create the optimizer.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    test_eps: float
        epsilon passed to build_act_greedy.
    lr_init: float
        initial value of a non-trainable variable named 'lr'.
        NOTE(review): the learning rate given to the optimizer is the cosine
        schedule below and does not read this variable, so lr_init currently
        has no effect on training.
    lr_period_steps: int
        the schedule is clip(0.0005 * cos(pi * iteration / lr_period_steps) + 0.000501, 1e-6, 1e-3).
    tau: float
        soft target update coefficient: target <- tau * online + (1 - tau) * target.

    Returns
    -------
    act_f:
        act function built by build_act.
    act_greedy:
        act function built by build_act_greedy with eps=test_eps.
    q_values: (obs) -> [q1, q2]
    train: (obs_t, action, reward, obs_tp1, done, weight, iteration)
            -> [td_error1, td_error2, mean(errors), merged_summary]
        also runs the optimization step. `done` is accepted but is not used
        in the target (the done mask is disabled below).
    update_target: () -> ()
        applies the soft update to both target networks.
    debug: {str: function}
        currently {'q_values': q_values}.
    """
    # Build action graphs
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)
    act_greedy = build_act_greedy(make_obs_ph, q_func, num_actions, scope=scope, reuse=True, eps=test_eps)

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.compat.v1.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.compat.v1.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.compat.v1.placeholder(tf.float32, [None], name="weight")
        iteration = tf.compat.v1.placeholder(tf.float32, name="iteration")

        # Cosine learning rate adjustment.
        # The variable is created (so the graph keeps its 'lr' variable) but
        # the name `lr` is immediately rebound to the schedule tensor.
        lr = tf.Variable(float(lr_init), trainable=False, dtype = tf.float32, name='lr')
        lr = tf.clip_by_value(0.0005*tf.math.cos(math.pi*iteration/lr_period_steps)+0.000501, 1e-6, 1e-3)
        optimizer = optimizer_f(learning_rate = lr)

        # q network evaluation
        q1_t = q_func(obs_t_input.get(), num_actions, scope="q1_func", reuse=True)  # reuse q1 parameters from act
        q1_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q1_func")
        q2_t = q_func(obs_t_input.get(), num_actions, scope="q2_func", reuse=True)  # reuse q2 parameters from act
        q2_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q2_func")

        # target q network evaluation
        q1_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q1_func", reuse=False)
        target_q1_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q1_func")
        q2_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q2_func", reuse=False)
        target_q2_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q2_func")

        # q scores for actions which we know were selected in the given state.
        q1_t_selected = tf.reduce_sum(input_tensor=q1_t * tf.one_hot(act_t_ph, num_actions), axis=1)
        q2_t_selected = tf.reduce_sum(input_tensor=q2_t * tf.one_hot(act_t_ph, num_actions), axis=1)

        # Actions selected with current q funcs at state t+1.
        q1_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q1_func", reuse=True)
        q2_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q2_func", reuse=True)
        tp1_best_action_using_online_net = tf.argmax(input=q1_tp1_using_online_net+q2_tp1_using_online_net, axis=1)

        # Using action at t+1 find target value associated with the action
        q1_tp1_selected = tf.reduce_sum(input_tensor=q1_tp1 * tf.one_hot(tp1_best_action_using_online_net, num_actions), axis=1)
        q2_tp1_selected = tf.reduce_sum(input_tensor=q2_tp1 * tf.one_hot(tp1_best_action_using_online_net, num_actions), axis=1)

        # Min of target q values to be used in the bellman equation
        q_tp1_best = tf.minimum(q1_tp1_selected, q2_tp1_selected)

        # NOTE: the done mask is intentionally left disabled here, as in the
        # original code; the bootstrap term is used for every transition.
        # q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_tp1_selected_target = rew_t_ph + gamma * q_tp1_best

        # compute the error (potentially clipped)
        td_error1 = q1_t_selected - tf.stop_gradient(q_tp1_selected_target)
        td_error2 = q2_t_selected - tf.stop_gradient(q_tp1_selected_target)
        errors1 = U.huber_loss(td_error1)
        errors2 = U.huber_loss(td_error2)
        errors = errors1 + errors2
        weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph * errors)

        # Print total number of params (all trainable variables in the graph).
        # TensorShape.num_elements() avoids depending on Dimension.value,
        # which is unavailable when shape iteration yields plain ints.
        total_parameters = sum(
            variable.get_shape().num_elements()
            for variable in tf.compat.v1.trainable_variables()
        )
        print("===============================================================")
        print("Total number of trainable params:", total_parameters)
        print("===============================================================")

        # Log for tensorboard
        tf.summary.scalar('q1_values', tf.math.reduce_mean(q1_t))
        tf.summary.scalar('q2_values', tf.math.reduce_mean(q2_t))
        tf.summary.scalar('td_1', tf.math.reduce_mean(td_error1))
        tf.summary.scalar('td_2', tf.math.reduce_mean(td_error2))
        tf.summary.scalar('weighted_loss', weighted_error)
        tf.summary.scalar('lr_schedule', lr)
        tf.summary.scalar('td_MSE_1', tf.math.reduce_mean(tf.math.square(td_error1)))
        tf.summary.scalar('td_MSE_2', tf.math.reduce_mean(tf.math.square(td_error2)))

        # combine variable scopes
        q_func_vars = q1_func_vars+q2_func_vars

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called every step to copy Q network to target Q network
        # target network is updated with polyak averaging; online and target
        # variables are paired by sorting both lists on variable name.
        update_target_expr1 = []
        for var, var_target in zip(sorted(q1_func_vars, key=lambda v: v.name),
                                   sorted(target_q1_func_vars, key=lambda v: v.name)):
            update_target_expr1.append(var_target.assign(tau*var + (1-tau)*var_target))
        update_target_expr1 = tf.group(*update_target_expr1)

        update_target_expr2 = []
        for var, var_target in zip(sorted(q2_func_vars, key=lambda v: v.name),
                                   sorted(target_q2_func_vars, key=lambda v: v.name)):
            update_target_expr2.append(var_target.assign(tau*var + (1-tau)*var_target))
        update_target_expr2 = tf.group(*update_target_expr2)

        merged_summary = tf.compat.v1.summary.merge_all(scope=tf.compat.v1.get_variable_scope().name)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
                iteration
            ],
            outputs=[td_error1, td_error2, tf.reduce_mean(input_tensor=errors), merged_summary],
            updates=[optimize_expr, lr]
        )
        update_target = U.function([], [], updates=[update_target_expr1, update_target_expr2])

        q_values = U.function(inputs=[obs_t_input], outputs=[q1_t, q2_t])

        return act_f, act_greedy, q_values, train, update_target, {'q_values': q_values}
def build_train(make_obs_ph, q_func, num_actions, optimizer_f,
                grad_norm_clipping=None, gamma=0.9, scope="deepadfq",
                reuse=None, varTH=1e-05, test_eps=0.05, act_policy='egreedy',
                learning_rate=0.001, learning_rate_decay_factor=0.99,
                learning_rate_growth_factor=1.001):
    """Creates the train function:

    The online and target networks each output ``num_actions * 2`` values per
    observation. For the online network the first ``num_actions`` columns are
    used as means and the remaining columns as ``rho``, where the standard
    deviation is ``exp(-rho)``. The training op regresses the mean and the
    log standard deviation of the selected action onto targets that the
    caller feeds in (``target_means`` / ``target_sd``).

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with
        that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of outputs (called here with ``num_actions * 2``)
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions * 2).
    num_actions: int
        number of actions
    optimizer_f: callable
        called as ``optimizer_f(learning_rate=lr)`` with the learning-rate
        variable; must return an optimizer providing ``compute_gradients``,
        ``apply_gradients`` and ``minimize``.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate. Not referenced in this function body; kept for
        interface compatibility.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse
        the scope must be given.
    varTH : float
        variance threshold. Not referenced in this function body; kept for
        interface compatibility.
    test_eps : float
        epsilon value for epsilon-greedy method in evaluation
    act_policy : str
        either 'egreedy' or 'bayesian' for action policy
    learning_rate : float
        initial value of the learning-rate variable
    learning_rate_decay_factor : float
        factor the learning rate is multiplied by when the decay op is run
    learning_rate_growth_factor : float
        factor the learning rate is multiplied by when the growth op is run

    Returns
    -------
    act_f:
        action function built by ``build_act`` ('egreedy') or
        ``build_act_bayesian`` ('bayesian').
    act_test:
        action function built by ``build_act_greedy`` with ``eps=test_eps``.
    q_target_vals: function
        maps a next-observation batch to ``[q_tp1]``, the target network
        output.
    train: function
        takes (obs_t, action, target_means, target_sd, importance weights),
        applies one optimization step and returns
        ``[mean huber loss, mean_error, sd_error, lr]``.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q
        function (hard copy).
    learning_rate_decay_op, learning_rate_growth_op: tf ops
        assign ``lr * factor`` to the learning rate, clipped to
        [1e-5, 1e-3].

    Raises
    ------
    ValueError
        if ``act_policy`` is neither 'egreedy' nor 'bayesian'.
    """
    if act_policy == 'egreedy':
        act_f = build_act(make_obs_ph, q_func, num_actions,
                          scope=scope, reuse=reuse)
    elif act_policy == 'bayesian':
        act_f = build_act_bayesian(make_obs_ph, q_func, num_actions,
                                   scope=scope, reuse=reuse)
    else:
        raise ValueError(
            "Please choose either egreedy or bayesian for action policy.")
    act_test = build_act_greedy(make_obs_ph, q_func, num_actions,
                                scope=scope, reuse=True, eps=test_eps)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        # The reward and done placeholders are not fed to any function built
        # below; they are still created so the graph layout is unchanged.
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(
            tf.float32, [None], name="weight")

        # Learning rate adjustment
        lr = tf.Variable(float(learning_rate), trainable=False,
                         dtype=tf.float32)
        learning_rate_decay_op = lr.assign(
            tf.clip_by_value(lr * learning_rate_decay_factor, 1e-5, 1e-3))
        learning_rate_growth_op = lr.assign(
            tf.clip_by_value(lr * learning_rate_growth_factor, 1e-5, 1e-3))
        optimizer = optimizer_f(learning_rate=lr)

        # Regression targets supplied by the caller at train time.
        target_means = tf.placeholder(
            tf.float32, [None], name="target_means")
        target_sd = tf.placeholder(tf.float32, [None], name="target_sd")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions * 2, scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions * 2,
                       scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # First half of the outputs: means. Second half: rho = -log(sd).
        mean_values = q_t[:, :num_actions]
        rho_values = q_t[:, num_actions:]
        action_one_hot = tf.one_hot(act_t_ph, num_actions, dtype=tf.float32)
        mean_selected = tf.reduce_sum(mean_values * action_one_hot, 1)
        rho_selected = tf.reduce_sum(rho_values * action_one_hot, 1)

        mean_error = mean_selected - tf.stop_gradient(target_means)
        # sd = exp(-rho), hence log(sd) == -rho. Using -rho directly avoids
        # the exp/log round trip, which overflows or underflows for large
        # |rho| and then yields inf/NaN losses and gradients.
        sd_error = -rho_selected - tf.log(tf.stop_gradient(target_sd))
        # NOTE: a plain sd difference and a KL divergence between Normal
        # distributions were previously tried as alternatives to this loss.
        huber_loss = U.huber_loss(mean_error) + U.huber_loss(sd_error)
        weighted_loss = tf.reduce_mean(huber_loss * importance_weights_ph)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(
                weighted_loss, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (
                        tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(
                weighted_loss, var_list=q_func_vars)

        # Hard copy of the online network into the target network; variables
        # are paired by sorting both lists on their names.
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                target_means,
                target_sd,
                importance_weights_ph,
            ],
            outputs=[tf.reduce_mean(huber_loss), mean_error, sd_error, lr],
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])
        q_target_vals = U.function(inputs=[obs_tp1_input], outputs=[q_tp1])

        return (act_f, act_test, q_target_vals, train, update_target,
                learning_rate_decay_op, learning_rate_growth_op)
def build_train(make_obs_ph, q_func, num_actions, optimizer_f,
                grad_norm_clipping=None, gamma=1.0, double_q=False,
                scope="setdeepq", reuse=None, param_noise=False,
                param_noise_filter_func=None, test_eps=0.05, lr_init=0.001,
                lr_decay_factor=0.99, lr_growth_factor=1.001, tau=0.001):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with
        that name
    q_func: object
        the model; its ``forward`` method is called as
        ``q_func.forward(observation_in, num_actions, scope=..., reuse=...)``
        with:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values
        of every action.
    num_actions: int
        number of actions
    optimizer_f: callable
        called as ``optimizer_f(learning_rate=lr)`` with the learning-rate
        variable; must return an optimizer providing ``compute_gradients``,
        ``apply_gradients`` and ``minimize``.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning
        (https://arxiv.org/abs/1509.06461). Defaults to False.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse
        the scope must be given.
    param_noise: bool
        accepted for interface compatibility; not referenced in this
        function body.
    param_noise_filter_func: tf.Variable -> bool
        accepted for interface compatibility; not referenced in this
        function body.
    test_eps : float
        epsilon passed to ``build_act_greedy`` for the evaluation policy.
    lr_init : float
        initial learning rate
    lr_decay_factor : float
        learning rate decay factor. It should be equal to or smaller than
        1.0.
    lr_growth_factor : float
        learning rate growth factor. It should be equal to or larger than
        1.0.
    tau : float
        parameter for the soft target network update. tau <= 1.0 and 1.0 for
        the hard update.

    Returns
    -------
    act_f:
        action function built by ``build_act``.
    act_greedy:
        action function built by ``build_act_greedy`` with ``eps=test_eps``.
    q_values: function
        maps an observation batch to the online network output ``q_t``.
    train: function
        takes (obs_t, action, reward, obs_tp1, done, importance weights),
        applies one optimization step and returns
        ``[td_error, mean huber error, merged_summary]``.
    update_target: () -> ()
        moves the target Q function towards the optimized Q function:
        ``target <- tau * online + (1 - tau) * target``.
    lr_decay_op, lr_growth_op: tf ops
        assign ``lr * factor`` to the learning rate, clipped to
        [1e-5, 1e-2].
    debug: {str: function}
        ``{'q_values': q_values}``.
    """
    # Build action graphs
    act_f = build_act(make_obs_ph, q_func, num_actions,
                      scope=scope, reuse=reuse)
    act_greedy = build_act_greedy(make_obs_ph, q_func, num_actions,
                                  scope=scope, reuse=True, eps=test_eps)

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.compat.v1.placeholder(
            tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.compat.v1.placeholder(
            tf.float32, [None], name="done")
        importance_weights_ph = tf.compat.v1.placeholder(
            tf.float32, [None], name="weight")

        # Learning rate adjustment
        lr = tf.Variable(float(lr_init), trainable=False, dtype=tf.float32)
        lr_decay_op = lr.assign(
            tf.clip_by_value(lr * lr_decay_factor, 1e-5, 1e-2))
        lr_growth_op = lr.assign(
            tf.clip_by_value(lr * lr_growth_factor, 1e-5, 1e-2))
        optimizer = optimizer_f(learning_rate=lr)

        # q network evaluation
        q_t = q_func.forward(obs_t_input.get(), num_actions, scope="q_func",
                             reuse=True)  # reuse parameters from act
        q_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func.forward(obs_tp1_input.get(), num_actions,
                               scope="target_q_func", reuse=False)
        target_q_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            input_tensor=q_t * tf.one_hot(act_t_ph, num_actions), axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            # Double Q: the online network picks the action, the target
            # network evaluates it.
            q_tp1_using_online_net = q_func.forward(
                obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(
                input=q_tp1_using_online_net, axis=1)
            q_tp1_best = tf.reduce_sum(
                input_tensor=q_tp1 * tf.one_hot(
                    q_tp1_best_using_online_net, num_actions),
                axis=1)
        else:
            q_tp1_best = tf.reduce_max(input_tensor=q_tp1, axis=1)
        # Zero out the bootstrap value where the done mask is 1.
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(
            input_tensor=importance_weights_ph * errors)

        # Log for tensorboard. The compat.v1 summary ops are used so they are
        # registered in the graph's summary collection that
        # tf.compat.v1.summary.merge_all below gathers; under TensorFlow 2.x
        # plain tf.summary.scalar is a different (V2) API.
        tf.compat.v1.summary.scalar('q_values', tf.math.reduce_mean(q_t))
        tf.compat.v1.summary.scalar(
            'td_MSE', tf.math.reduce_mean(tf.math.square(td_error)))
        tf.compat.v1.summary.scalar('weighted_loss', weighted_error)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(
                weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (
                        tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(
                weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to move the target Q
        # network towards the Q network (polyak averaging with rate tau);
        # variables are paired by sorting both lists on their names.
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(tau * var + (1 - tau) * var_target))
        update_target_expr = tf.group(*update_target_expr)

        merged_summary = tf.compat.v1.summary.merge_all(
            scope=tf.compat.v1.get_variable_scope().name)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
            ],
            outputs=[
                td_error,
                tf.reduce_mean(input_tensor=errors),
                merged_summary,
            ],
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function(inputs=[obs_t_input], outputs=q_t)

        return (act_f, act_greedy, q_values, train, update_target,
                lr_decay_op, lr_growth_op, {'q_values': q_values})