def build_adv(make_obs_tf, q_func, num_actions, epsilon, noisy, attack=None):
    with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE):
        obs_tf_in = U.ensure_tf_input(make_obs_tf("observation"))
        stochastic_ph_adv = tf.placeholder(tf.bool, (), name="stochastic_adv")
        update_eps_ph_adv = tf.placeholder(tf.float32, (), name="update_eps_adv")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        update_eps_expr_adv = eps.assign(
            tf.cond(update_eps_ph_adv >= 0, lambda: update_eps_ph_adv, lambda: eps))

        # Wrap the Q-network so cleverhans can query it as a model
        # (the same pattern used in build_act_enjoy below).
        def prob_wrapper(x):
            return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)

        def logit_wrapper(x):
            return q_func(x, num_actions, scope="q_func", reuse=True, noisy=noisy)

        if attack == 'fgsm':
            adversary = FastGradientMethod(CallableModelWrapper(prob_wrapper, 'probs'),
                                           sess=U.get_session())
            adv_observations = adversary.generate(obs_tf_in.get(), eps=epsilon,
                                                  clip_min=0, clip_max=1.0) * 255.0
        else:
            adversary = CarliniWagnerL2(CallableModelWrapper(logit_wrapper, 'logits'),
                                        sess=U.get_session())
            cw_params = {'binary_search_steps': 1,
                         'max_iterations': 100,
                         'learning_rate': 0.1,
                         'initial_const': 10,
                         'clip_min': 0,
                         'clip_max': 1.0}
            adv_observations = adversary.generate(obs_tf_in.get(), **cw_params) * 255.0

        craft_adv_obs = U.function(inputs=[obs_tf_in, stochastic_ph_adv, update_eps_ph_adv],
                                   outputs=adv_observations,
                                   givens={update_eps_ph_adv: -1.0, stochastic_ph_adv: True},
                                   updates=[update_eps_expr_adv])
        return craft_adv_obs
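# A minimal usage sketch for build_adv (illustrative only, not part of the original file).
# `make_obs_ph` and `dueling_model` are stand-ins supplied by the caller's training script,
# and observations are assumed to be uint8 images in [0, 255]. The returned craft_adv_obs
# follows the same calling convention as `act`: it takes a batched observation and returns
# the perturbed observation rescaled to [0, 255].
def _example_build_adv_usage(make_obs_ph, dueling_model, num_actions):
    craft_adv_obs = build_adv(make_obs_ph, dueling_model, num_actions,
                              epsilon=1.0 / 255.0, noisy=False, attack='fgsm')

    def perturb(obs_batch):
        # obs_batch: array of shape (batch, height, width, channels) in [0, 255]
        return craft_adv_obs(obs_batch)
    return perturb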
def build_act(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    noisy: bool
        whether to build a NoisyNet Q-function.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func", noisy=noisy)
        q_values = q_values.get_logits(observations_ph.get())
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
        return act
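# A short usage sketch for the act function returned by build_act (illustrative only;
# `make_obs_ph`, `model`, and `env` are stand-ins supplied by the caller's training script).
def _example_act_usage(make_obs_ph, model, env, num_actions):
    act = build_act(make_obs_ph, model, num_actions, noisy=False)
    U.initialize()
    obs = env.reset()
    # `act` expects a batched observation; update_eps < 0 keeps the stored epsilon value.
    action = act(obs[None], update_eps=0.01)[0]
    return action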
def build_act_enjoy(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None,
                    attack=None, model_path=''):
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func", noisy=noisy)
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])

        # Load the model before constructing the attack graph so that TF won't
        # complain it can't load parameters for the attack.
        try:
            U.load_state(model_path)
        except Exception:
            pass

        if attack is not None:
            if attack == 'fgsm':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
                adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(observations_ph.get(), eps=1.0 / 255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'iterative':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
                adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(observations_ph.get(), eps=1.0 / 255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'cwl2':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, noisy=noisy)
                adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
                cw_params = {'binary_search_steps': 1,
                             'max_iterations': 100,
                             'learning_rate': 0.1,
                             'initial_const': 10,
                             'clip_min': 0,
                             'clip_max': 1.0}
                adv_observations = adversary.generate(observations_ph.get(), **cw_params) * 255.0

            craft_adv_obs = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                                       outputs=adv_observations,
                                       givens={update_eps_ph: -1.0, stochastic_ph: True},
                                       updates=[update_eps_expr])
            return act, craft_adv_obs

        return act
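# Sketch of how build_act_enjoy might be used at evaluation time to attack a trained
# policy (names such as `make_obs_ph`, `model`, `env`, and `model_path` are assumptions,
# not definitions from this file). The crafted observation replaces the clean one before
# the action is selected.
def _example_adversarial_enjoy_step(make_obs_ph, model, env, num_actions, model_path):
    act, craft_adv_obs = build_act_enjoy(make_obs_ph, model, num_actions,
                                         noisy=False, attack='fgsm', model_path=model_path)
    obs = env.reset()
    adv_obs = craft_adv_obs(obs[None])          # perturbed observation, rescaled to [0, 255]
    action = act(adv_obs, stochastic=False)[0]  # greedy action on the adversarial input
    return action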
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
                double_q=True, noisy=False, scope="deepq", reuse=None, attack=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    noisy: bool
        whether to build a NoisyNet Q-function.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    attack: str or None
        which adversarial attack graph to build ('fgsm', 'iterative', 'cwl2'), if any.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    craft_adv_obs: function or None
        crafts adversarial versions of obs_tp1 when `attack` is given.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, noisy=noisy, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", noisy=noisy, reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", noisy=noisy)
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", noisy=noisy, reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, errors],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        ################## Vahid's Work ###################
        # U.load_state(model_path)
        # Build the adversarial-example crafting graph against obs_tp1, if requested.
        if attack is not None:
            if attack == 'fgsm':
                def wrapper(x):
                    return q_func(x, num_actions, scope="target_q_func", reuse=True, concat_softmax=True, noisy=noisy)
                adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(obs_tp1_input.get(), eps=1.0 / 255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'iterative':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
                adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(obs_tp1_input.get(), eps=1.0 / 255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'cwl2':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, noisy=noisy)
                adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
                cw_params = {'binary_search_steps': 1,
                             'max_iterations': 100,
                             'learning_rate': 0.1,
                             'initial_const': 10,
                             'clip_min': 0,
                             'clip_max': 1.0}
                adv_observations = adversary.generate(obs_tp1_input.get(), **cw_params) * 255.0

            craft_adv_obs = U.function(inputs=[obs_tp1_input],
                                       outputs=adv_observations,
                                       updates=[update_target_expr])
        else:
            craft_adv_obs = None

        return act_f, train, update_target, {'q_values': q_values}, craft_adv_obs
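# Illustrative wiring of build_train with an optimizer, a sketch under the assumption that
# `make_obs_ph` and `dueling_model` are provided by the surrounding training script (as in
# the baselines DQN setup); not a definitive training loop.
def _example_build_train_setup(make_obs_ph, dueling_model, num_actions):
    act, train, update_target, debug, craft_adv_obs = build_train(
        make_obs_ph,
        dueling_model,
        num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=1e-4),
        grad_norm_clipping=10,
        gamma=0.99,
        double_q=True,
        noisy=False,
        attack='fgsm')
    U.initialize()
    update_target()  # sync the target network with the online network before training
    return act, train, update_target, craft_adv_obs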