def test_MpiAdam():
    """Check that MpiAdam reproduces ``tf.train.AdamOptimizer`` on a toy loss.

    The same loss is minimised for ten steps twice, each time after running
    the global variable initializer:

    1. with TensorFlow's Adam update op (reference run);
    2. with ``MpiAdam`` fed the flattened gradient of the loss.

    The per-step losses of both runs are printed and then required to agree
    to within an absolute tolerance of ``1e-4``.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference run: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losses_ref = []
    for i in range(10):
        step_loss = do_update()
        print(i, step_loss)
        losses_ref.append(step_loss)

    # Re-run the initializer so that `a`, `b` and the optimizer slots are
    # reset before the second run.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # Deliberately no `updates=[update_op]` here: the variables must be moved
    # by MpiAdam alone, otherwise both optimizers step on every iteration and
    # the run no longer tests MpiAdam.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losses_test = []
    for i in range(10):
        step_loss, grad = lossandgrad()
        adam.update(grad, stepsize)
        print(i, step_loss)
        losses_test.append(step_loss)

    np.testing.assert_allclose(
        np.array(losses_ref), np.array(losses_test), atol=1e-4)
def validate_probtype(probtype, pdparam):
    """Monte-Carlo consistency checks for a probability-distribution type.

    Two identities are verified with ``100000`` samples drawn from the
    distribution parameterised by ``pdparam``; each must hold to within three
    standard errors of the sampled estimate:

    * mean negative log-likelihood == entropy reported by the distribution;
    * ``kl(p, q) == -entropy(p) - E_p[log q]`` where ``q`` uses ``pdparam``
      perturbed by Gaussian noise scaled by 0.1.

    Parameters
    ----------
    probtype
        object providing ``param_placeholder``, ``sample_placeholder`` and
        ``pdclass``.
    pdparam: np.ndarray
        one-dimensional parameter vector, tiled along a new leading axis.

    Raises
    ------
    AssertionError
        if either identity is violated by more than three standard errors.
    """
    num_samples = 100000

    # --- Check 1: mean negative log likelihood == differential entropy ---
    params_p = np.repeat(pdparam[None, :], num_samples, axis=0)
    param_ph = probtype.param_placeholder([num_samples])
    sample_ph = probtype.sample_placeholder([num_samples])
    dist_p = probtype.pdclass()(param_ph)
    loglik_fn = U.function([sample_ph, param_ph], dist_p.logp(sample_ph))
    entropy_fn = U.function([param_ph], dist_p.entropy())
    samples = U.eval(dist_p.sample(), feed_dict={param_ph: params_p})

    loglik_under_p = loglik_fn(samples, params_p)
    sampled_entropy = -loglik_under_p.mean()  # pylint: disable=E1101
    sampled_entropy_stderr = loglik_under_p.std() / np.sqrt(num_samples)  # pylint: disable=E1101
    entropy = entropy_fn(params_p).mean()  # pylint: disable=E1101
    # within 3 sigmas
    assert np.abs(entropy - sampled_entropy) < 3 * sampled_entropy_stderr

    # --- Check 2: kldiv[p,q] = - ent[p] - E_p[log q] ---
    param_ph_q = probtype.param_placeholder([num_samples])
    dist_q = probtype.pdclass()(param_ph_q)
    perturbed = pdparam + np.random.randn(pdparam.size) * 0.1
    params_q = np.repeat(perturbed[None, :], num_samples, axis=0)
    kl_fn = U.function([param_ph, param_ph_q], dist_p.kl(dist_q))
    kl = kl_fn(params_p, params_q).mean()  # pylint: disable=E1101

    loglik_under_q = loglik_fn(samples, params_q)
    sampled_kl = -entropy - loglik_under_q.mean()  # pylint: disable=E1101
    sampled_kl_stderr = loglik_under_q.std() / np.sqrt(num_samples)  # pylint: disable=E1101
    # within 3 sigmas
    assert np.abs(kl - sampled_kl) < 3 * sampled_kl_stderr
def build_adv(make_obs_tf, q_func, num_actions, epsilon, noisy):
    """Build a function that crafts FastGradientMethod adversarial observations.

    Parameters
    ----------
    make_obs_tf: str -> tf.placeholder or TfInput
        called with the name "observation" to create the observation input.
    q_func
        model builder; invoked with the observation tensor, ``num_actions``,
        ``scope="q_func"``, ``reuse=True``, ``concat_softmax=True`` and
        ``noisy``.
    num_actions: int
        forwarded to ``q_func``.
    epsilon: float
        passed to the attack's ``generate`` as ``eps``.
    noisy: bool
        forwarded to ``q_func``.

    Returns
    -------
    craft_adv_obs: function
        takes (observation, stochastic, update_eps) -- the last two default to
        ``True`` and ``-1.0`` -- and returns the generated observations
        multiplied by 255.0.  Each call also runs the ``eps`` assignment,
        which leaves ``eps`` unchanged when ``update_eps`` is negative.
    """
    with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE):
        obs_input = U.ensure_tf_input(make_obs_tf("observation"))
        stochastic_flag = tf.placeholder(tf.bool, (), name="stochastic_adv")
        requested_eps = tf.placeholder(
            tf.float32, (), name="update_eps_adv")

        eps = tf.get_variable(
            "eps", (), initializer=tf.constant_initializer(0))
        # Keep the current eps unless a non-negative value is supplied.
        next_eps = tf.cond(
            requested_eps >= 0, lambda: requested_eps, lambda: eps)
        assign_eps = eps.assign(next_eps)
        print("==========================================")

        model_output = q_func(
            obs_input.get(), num_actions,
            scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
        adversary = FastGradientMethod(model_output, sess=U.get_session())

        fgsm_params = dict(eps=epsilon, clip_min=0, clip_max=1.0)
        adv_observations = adversary.generate(
            obs_input.get(), **fgsm_params) * 255.0

        return U.function(
            inputs=[obs_input, stochastic_flag, requested_eps],
            outputs=adv_observations,
            givens={requested_eps: -1.0, stochastic_flag: True},
            updates=[assign_eps])
def build_adv(make_obs_tf, q_func, num_actions, epsilon, noisy, attack=None):
    """Build a function that crafts adversarial observations.

    Parameters
    ----------
    make_obs_tf: str -> tf.placeholder or TfInput
        called with the name "observation" to create the observation input.
    q_func
        model builder; invoked with the observation tensor, ``num_actions``,
        ``scope="q_func"``, ``reuse=True``, ``concat_softmax=True`` and
        ``noisy``.
    num_actions: int
        forwarded to ``q_func``.
    epsilon: float
        passed as ``eps`` to the FastGradientMethod attack; not used by the
        CarliniWagnerL2 branch.
    noisy: bool
        forwarded to ``q_func``.
    attack: str or None
        ``'fgsm'`` selects FastGradientMethod; every other value (including
        the default ``None``) selects CarliniWagnerL2.

    Returns
    -------
    craft_adv_obs: function
        takes (observation, stochastic, update_eps) -- the last two default to
        ``True`` and ``-1.0`` -- and returns the generated observations
        multiplied by 255.0.  Each call also runs the ``eps`` assignment.
    """
    with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE):
        obs_input = U.ensure_tf_input(make_obs_tf("observation"))
        stochastic_flag = tf.placeholder(tf.bool, (), name="stochastic_adv")
        requested_eps = tf.placeholder(
            tf.float32, (), name="update_eps_adv")

        eps = tf.get_variable(
            "eps", (), initializer=tf.constant_initializer(0))
        # Keep the current eps unless a non-negative value is supplied.
        next_eps = tf.cond(
            requested_eps >= 0, lambda: requested_eps, lambda: eps)
        assign_eps = eps.assign(next_eps)
        print("==========================================")

        use_fgsm = attack == 'fgsm'

        # Both attacks wrap the same model output; only the class differs.
        model_output = q_func(
            obs_input.get(), num_actions,
            scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
        attack_cls = FastGradientMethod if use_fgsm else CarliniWagnerL2
        adversary = attack_cls(model_output, sess=U.get_session())

        if use_fgsm:
            adv_observations = adversary.generate(
                obs_input.get(), eps=epsilon, clip_min=0,
                clip_max=1.0) * 255.0
            print("----")
            print(adv_observations.shape)
        else:
            cw_params = dict(
                binary_search_steps=1,
                max_iterations=100,
                learning_rate=0.1,
                initial_const=10,
                clip_min=0,
                clip_max=1.0,
            )
            adv_observations = adversary.generate(
                obs_input.get(), **cw_params) * 255.0

        return U.function(
            inputs=[obs_input, stochastic_flag, requested_eps],
            outputs=adv_observations,
            givens={requested_eps: -1.0, stochastic_flag: True},
            updates=[assign_eps])
def build_act(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None):
    """Create the epsilon-greedy action-selection function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with the name "observation" to create the observation input.
    q_func
        model builder; invoked with the observation tensor, ``num_actions``,
        ``scope="q_func"`` and ``noisy``.  Its result must expose
        ``get_logits(observation)``, whose output is arg-maxed along axis 1
        to pick the greedy action.
    num_actions: int
        number of actions; also the exclusive upper bound of the uniformly
        drawn random actions.
    noisy: bool
        forwarded to ``q_func``.
    scope: str or VariableScope
        variable scope wrapping everything built here.
    reuse: bool or None
        ``reuse`` argument of that variable scope.

    Returns
    -------
    act: function
        takes (observation, stochastic, update_eps); the last two default to
        ``True`` and ``-1.0``.  Returns the greedy actions when ``stochastic``
        is false, otherwise actions that are replaced by a uniformly random
        action wherever a uniform draw in [0, 1) falls below ``eps``.  A
        non-negative ``update_eps`` is assigned to ``eps`` as part of the
        call; a negative one leaves ``eps`` unchanged.
    """
    with tf.variable_scope(scope, reuse=reuse):
        obs_input = U.ensure_tf_input(make_obs_ph("observation"))
        use_stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        requested_eps = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable(
            "eps", (), initializer=tf.constant_initializer(0))

        model = q_func(
            obs_input.get(), num_actions, scope="q_func", noisy=noisy)
        logits = model.get_logits(obs_input.get())
        greedy_actions = tf.argmax(logits, axis=1)

        batch_size = tf.shape(obs_input.get())[0]
        uniform_actions = tf.random_uniform(
            tf.stack([batch_size]),
            minval=0, maxval=num_actions, dtype=tf.int64)
        explore_draw = tf.random_uniform(
            tf.stack([batch_size]),
            minval=0, maxval=1, dtype=tf.float32)
        explore_mask = explore_draw < eps
        eps_greedy_actions = tf.where(
            explore_mask, uniform_actions, greedy_actions)

        selected_actions = tf.cond(
            use_stochastic,
            lambda: eps_greedy_actions,
            lambda: greedy_actions)
        # Keep the current eps unless a non-negative value is supplied.
        next_eps = tf.cond(
            requested_eps >= 0, lambda: requested_eps, lambda: eps)
        assign_eps = eps.assign(next_eps)

        return U.function(
            inputs=[obs_input, use_stochastic, requested_eps],
            outputs=selected_actions,
            givens={requested_eps: -1.0, use_stochastic: True},
            updates=[assign_eps])
def test_function():
    """Exercise positional, keyword and ``givens``-defaulted calls of ``function``.

    ``lin`` evaluates ``3 * x + 2 * y`` with ``y`` defaulting to 0.
    """
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    lin = function([x, y], 3 * x + 2 * y, givens={y: 0})

    # (positional args, keyword args, expected value)
    cases = (
        ((2,), {}, 6),
        ((), {"x": 3}, 9),
        ((2, 2), {}, 10),
        ((), {"x": 2, "y": 3}, 12),
    )
    with single_threaded_session():
        initialize()
        for args, kwargs, expected in cases:
            assert lin(*args, **kwargs) == expected
def test_multikwargs():
    """Check that a keyword call is rejected when two inputs share the name "x".

    Two placeholders are both named "x" (one inside the "other" variable
    scope).  Positional calls must work, while ``lin(x=2)`` is expected to
    raise ``AssertionError``.
    """
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    lin = function([x, x2], 3 * x + 2 * x2, givens={x2: 0})

    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10

        try:
            lin(x=2)
        except AssertionError:
            rejected = True
        else:
            rejected = False
        assert rejected
def __init__(self, epsilon=1e-2, shape=()):
    """Create the running sum / sum-of-squares / count accumulators.

    Parameters
    ----------
    epsilon: float
        initial value of the sum-of-squares and count accumulators (the sum
        accumulator starts at 0.0).
    shape: tuple
        shape of the sum and sum-of-squares accumulators; the count is a
        scalar.

    Attributes set here: ``mean`` (sum / count cast with ``tf.to_float``),
    ``std`` (square root of ``sumsq / count - mean**2`` floored at 1e-2) and
    ``incfiltparams``, a function taking (sum, sumsq, count) increments and
    adding them to the accumulators.
    """
    def _accumulator(name, var_shape, initial_value):
        # Non-trainable float64 variable holding a running statistic.
        return tf.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.constant_initializer(initial_value),
            name=name,
            trainable=False)

    self._sum = _accumulator("runningsum", shape, 0.0)
    self._sumsq = _accumulator("runningsumsq", shape, epsilon)
    self._count = _accumulator("count", (), epsilon)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    sum_increment = tf.placeholder(
        shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_increment = tf.placeholder(
        shape=self.shape, dtype=tf.float64, name='var')
    count_increment = tf.placeholder(
        shape=[], dtype=tf.float64, name='count')

    increments = [
        tf.assign_add(self._sum, sum_increment),
        tf.assign_add(self._sumsq, sumsq_increment),
        tf.assign_add(self._count, count_increment),
    ]
    self.incfiltparams = U.function(
        [sum_increment, sumsq_increment, count_increment],
        [],
        updates=increments)
iteration_time_est = RunningAvg(0.999) obs = env.reset() # Record the mean of the \sigma sigma_name_list = [] sigma_list = [] for param in tf.trainable_variables(): # only record the \sigma in the action network if 'sigma' in param.name \ and 'deepq/q_func/action_value' in param.name: summary_name = \ param.name.replace( 'deepq/q_func/action_value/', '').replace( '/', '.').split(':')[0] sigma_name_list.append(summary_name) sigma_list.append(tf.reduce_mean(tf.abs(param))) f_mean_sigma = U.function(inputs=[], outputs=sigma_list) # Statistics writer = tf.summary.FileWriter(savedir, sess.graph) im_stats = statistics( scalar_keys=['action', 'im_reward', 'td_errors', 'huber_loss'] + sigma_name_list) ep_stats = statistics(scalar_keys=['ep_reward', 'ep_length']) # Main trianing loop ep_length = 0 while True: num_iters += 1 ep_length += 1 # V: Perturb observation if we are past the init stage # and at a designated attack step # if craft_adv != None and (num_iters >= args.attack_init)
def build_act_enjoy(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None, attack=None, model_path=''):
    """Create the act function and, optionally, an adversarial-observation crafter.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with the name "observation" to create the observation input.
    q_func
        model builder; invoked with the observation tensor, ``num_actions``,
        ``scope="q_func"`` and ``noisy``; its result is arg-maxed along
        axis 1 to pick the greedy action.
    num_actions: int
        number of actions.
    noisy: bool
        forwarded to ``q_func``.
    scope: str or VariableScope
        variable scope wrapping everything built here.
    reuse: bool or None
        ``reuse`` argument of that variable scope.
    attack: str or None
        ``None`` (no attack), ``'fgsm'``, ``'iterative'`` or ``'cwl2'``.
    model_path: str
        path handed to ``U.load_state`` before the attack graph is built.

    Returns
    -------
    ``act`` alone when ``attack`` is ``None``; otherwise the tuple
    ``(act, craft_adv_obs)`` where ``craft_adv_obs`` takes the same inputs as
    ``act`` and returns the attack output multiplied by 255.0.

    Raises
    ------
    ValueError
        if ``attack`` is not one of the supported values.
    """
    import warnings  # function-scope import: only needed for the load warning

    supported_attacks = (None, 'fgsm', 'iterative', 'cwl2')
    if attack not in supported_attacks:
        # Previously an unknown name fell through every branch and surfaced
        # as a NameError on `adv_observations`.
        raise ValueError(
            "unsupported attack {!r}; expected one of {}".format(
                attack, supported_attacks))

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func", noisy=noisy)
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(
            stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        # Keep the current eps unless a non-negative value is supplied.
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr])

        # Load model before attacks graph construction so that TF won't
        # complain can't load parameters for attack.  Loading stays
        # best-effort, but KeyboardInterrupt/SystemExit are no longer
        # swallowed and a failure for an explicitly given path is reported.
        try:
            U.load_state(model_path)
        except Exception as err:
            if model_path:
                warnings.warn(
                    "build_act_enjoy: could not load model state from "
                    "{!r}: {}".format(model_path, err))

        if attack is None:
            return act

        if attack == 'fgsm':
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True,
                              concat_softmax=True, noisy=noisy)
            adversary = FastGradientMethod(
                CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
            adv_observations = adversary.generate(
                observations_ph.get(), eps=1.0 / 255.0, clip_min=0, clip_max=1.0) * 255.0
        elif attack == 'iterative':
            # NOTE(review): unlike the 'fgsm' wrapper, `noisy` is not
            # forwarded here -- confirm whether that is intentional.
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True,
                              concat_softmax=True)
            adversary = BasicIterativeMethod(
                CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
            adv_observations = adversary.generate(
                observations_ph.get(), eps=1.0 / 255.0, clip_min=0, clip_max=1.0) * 255.0
        else:  # attack == 'cwl2'
            # NOTE(review): `noisy` is not forwarded here either -- confirm.
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True)
            adversary = CarliniWagnerL2(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            cw_params = {'binary_search_steps': 1,
                         'max_iterations': 100,
                         'learning_rate': 0.1,
                         'initial_const': 10,
                         'clip_min': 0,
                         'clip_max': 1.0}
            adv_observations = adversary.generate(
                observations_ph.get(), **cw_params) * 255.0

        craft_adv_obs = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=adv_observations,
            givens={update_eps_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr])

    return act, craft_adv_obs
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, noisy=False, scope="deepq", reuse=None, attack=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    noisy: bool
        forwarded to ``build_act`` and to the ``q_func`` calls made for
        training.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    attack: str or None
        ``None`` (no attack), ``'fgsm'``, ``'iterative'`` or ``'cwl2'``.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
    train: (object, np.array, np.array, object, np.array, np.array) -> list
        optimize the error in Bellman's equation; returns ``[td_error, errors]``.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    craft_adv_obs: function or None
        ``None`` when ``attack`` is ``None``; otherwise a function of the
        ``obs_tp1`` input returning the attack output multiplied by 255.0.

    Raises
    ------
    ValueError
        if ``attack`` is not one of the supported values.
    """
    supported_attacks = (None, 'fgsm', 'iterative', 'cwl2')
    if attack not in supported_attacks:
        # Previously an unknown name fell through every branch and surfaced
        # as a NameError on `adv_observations`.
        raise ValueError(
            "unsupported attack {!r}; expected one of {}".format(
                attack, supported_attacks))

    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, noisy=noisy, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation (reuse parameters from act)
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", noisy=noisy, reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", noisy=noisy)
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(
                obs_tp1_input.get(), num_actions, scope="q_func", noisy=noisy, reuse=True)
            # tf.argmax replaces the deprecated tf.arg_max alias.
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(
                optimizer, weighted_error,
                var_list=q_func_vars, clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network; variables are paired by sorted name.
        update_target_expr = tf.group(*[
            var_target.assign(var)
            for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name))
        ])

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, errors],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        # Adversarial-observation crafting (optional).
        craft_adv_obs = None
        if attack is not None:
            if attack == 'fgsm':
                def wrapper(x):
                    return q_func(x, num_actions, scope="target_q_func", reuse=True,
                                  concat_softmax=True, noisy=noisy)
                adversary = FastGradientMethod(
                    CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(
                    obs_tp1_input.get(), eps=1.0 / 255.0, clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'iterative':
                # NOTE(review): this wrapper targets "q_func" without `noisy`
                # while the 'fgsm' one targets "target_q_func" -- confirm.
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True,
                                  concat_softmax=True)
                adversary = BasicIterativeMethod(
                    CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                # Fixed: `observations_ph` does not exist in this function;
                # the crafting function is fed through `obs_tp1_input`.
                adv_observations = adversary.generate(
                    obs_tp1_input.get(), eps=1.0 / 255.0, clip_min=0, clip_max=1.0) * 255.0
            else:  # attack == 'cwl2'
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True)
                adversary = CarliniWagnerL2(
                    CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
                cw_params = {'binary_search_steps': 1,
                             'max_iterations': 100,
                             'learning_rate': 0.1,
                             'initial_const': 10,
                             'clip_min': 0,
                             'clip_max': 1.0}
                # Fixed: same undefined `observations_ph` as above.
                adv_observations = adversary.generate(
                    obs_tp1_input.get(), **cw_params) * 255.0

            # NOTE(review): every crafting call also runs update_target_expr,
            # i.e. copies the online network into the target network --
            # confirm this side effect is wanted.
            craft_adv_obs = U.function(
                inputs=[obs_tp1_input],
                outputs=adv_observations,
                updates=[update_target_expr])

    return act_f, train, update_target, {'q_values': q_values}, craft_adv_obs
iteration_time_est = RunningAvg(0.999) obs = env.reset() # Record the mean of the \sigma sigma_name_list = [] sigma_list = [] for param in tf.trainable_variables(): # only record the \sigma in the action network if 'sigma' in param.name \ and 'deepq/q_func/action_value' in param.name: summary_name = \ param.name.replace( 'deepq/q_func/action_value/', '').replace( '/', '.').split(':')[0] sigma_name_list.append(summary_name) sigma_list.append(tf.reduce_mean(tf.abs(param))) f_mean_sigma = U.function(inputs=[], outputs=sigma_list) # Statistics writer = tf.summary.FileWriter(savedir, sess.graph) im_stats = statistics(scalar_keys=['action', 'im_reward', 'td_errors', 'huber_loss'] + sigma_name_list) ep_stats = statistics(scalar_keys=['ep_reward', 'ep_length']) # Main trianing loop ep_length = 0 while True: num_iters += 1 ep_length += 1 # V: Perturb observation if we are past the init stage # and at a designated attack step # if craft_adv != None and (num_iters >= args.attack_init) # and ((num_iters - args.attack_init) % args.attack_freq == 0) :