parser.add_argument('--gpu-id',
                    type=int,
                    default=0,
                    help='GPU id to run on.')

if __name__ == '__main__':
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    torch.cuda.set_device(args.gpu_id)
    torch.manual_seed(args.seed)

    env = create_atari_env(args.env_name, args)
    if args.black_box_attack:
        shared_model = ActorCritic_Substitude(env.observation_space.shape[0],
                                              env.action_space)
    else:
        shared_model = ActorCritic(env.observation_space.shape[0],
                                   env.action_space)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    # load a pre-trained model according to the ft-setting
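
# Illustrative sketch (an assumption, not part of the original script): an A3C
# main block like the one above typically spawns one `train` worker per
# process, all sharing the model, a global step counter, and a lock. The
# helper name `launch_workers` and the argument `args.num_processes` are
# hypothetical here; only the `train(...)` signature below is taken from the
# source.
import torch.multiprocessing as mp


def launch_workers(args, shared_model, optimizer):
    counter = mp.Value('i', 0)   # global step counter shared by all workers
    lock = mp.Lock()             # guards updates to the counter
    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock,
                             optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
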
def train(rank, args, shared_model, counter, lock, optimizer=None):
    print('Train with A3C')
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name, args)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    model.train()

    output_directory = 'outputs/' + args.env_name
    checkpoint_directory, result_directory = prepare_sub_folder(
        output_directory)
    print(f'checkpoint directory {checkpoint_directory}')
    time.sleep(10)

    state = env.reset()
    state = torch.from_numpy(state)
    done = True
    episode_length = 0
    total_step = 0
    rewards_ep = []
    policy_loss_ep = []
    value_loss_ep = []

    for epoch in range(100000000):
        # Sync the local model with the shared model
        model.load_state_dict(shared_model.state_dict())

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out one full episode (rather than a fixed number of steps)
        is_Terminal = False
        while not is_Terminal:
            episode_length += 1
            total_step += 1
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                print(f'epoch {epoch} - steps {total_step} - '
                      f'total rewards {np.sum(rewards) + reward}')
                total_step = 1
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                rewards_ep.append(np.sum(rewards))
                is_Terminal = True

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()
        values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        policy_loss_ep.append(policy_loss.detach().numpy()[0, 0])
        value_loss_ep.append(value_loss.detach().numpy()[0, 0])

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()

        if epoch % 1000 == 0:
            torch.save({'state_dict': model.state_dict()},
                       checkpoint_directory + '/' + str(epoch) + '.pth.tar')
            with open(result_directory + '/' + str(epoch) + '_rewards.pkl',
                      'wb') as f:
                pickle.dump(rewards_ep, f)
            with open(result_directory + '/' + str(epoch) + '_policy_loss.pkl',
                      'wb') as f:
                pickle.dump(policy_loss_ep, f)
            with open(result_directory + '/' + str(epoch) + '_value_loss.pkl',
                      'wb') as f:
                pickle.dump(value_loss_ep, f)

        if episode_length >= 10000000:
            break

    torch.save({'state_dict': model.state_dict()},
               checkpoint_directory + '/Last' + '.pth.tar')
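
# Illustrative sketch (an assumption about the helper used in train(), not
# this repository's actual code): in common PyTorch A3C reference
# implementations, ensure_shared_grads copies the worker's locally computed
# gradients onto the shared model's parameters, so that the shared optimizer
# (here SharedAdam) applies the update to the weights every worker later
# re-loads via load_state_dict.
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            # shared gradients are already wired to a worker's gradients
            return
        shared_param.grad = param.grad
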
                    default='fgsm',
                    help='adversary attack algorithms: fgsm|rand_fgsm|cw2')
parser.add_argument('--epsilon-adv',
                    type=float,
                    default=0.003,
                    help='epsilon perturbation for the attack model.')

opts = parser.parse_args()
cudnn.benchmark = True

# Load experiment setting
config = get_config(opts.config)
torch.manual_seed(opts.seed)

env = create_atari_env(opts.env_name, opts)
trained_model = ActorCritic(env.observation_space.shape[0], env.action_space)

# load a pre-trained model according to the ft-setting
if opts.ft_setting == 'full-ft':
    if opts.env_name == 'BreakoutDeterministic-v4':
        fname = './agent/trained_model/breakout/11000.pth.tar'
    elif opts.env_name == 'PongDeterministic-v4':
        fname = './agent/trained_model/pong/4000.pth.tar'
    else:
        sys.exit('Only Breakout and Pong are supported')

    if os.path.isfile(fname):
        checkpoint = torch.load(fname)
        trained_model.load_state_dict(checkpoint['state_dict'])

    for param in trained_model.parameters():
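
# The loop above continues beyond this excerpt. For reference only (an
# assumption, not the original code): when a pre-trained agent is used purely
# as an attack/defense target, a common pattern is to freeze it so no
# gradients flow into its weights. The helper name below is hypothetical.
def freeze_parameters(model):
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
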
def transfer_defense(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    rl_vaegan_path = 'rl_vaegan/output/' + args.env_name + '/checkpoints'
    env = create_atari_env(args.env_name, args)

    # load the trained RL-VAEGAN model
    import rl_vaegan.transfer as t
    translate_model = t.TransferModel()
    translate_model.initialize(rl_vaegan_path, args.which_epoch, args)

    env.seed(args.seed + rank)
    if args.black_box_attack:
        print('Black Box Attack')
        model = ActorCritic_Substitude(env.observation_space.shape[0],
                                       env.action_space)
    else:
        print('White Box Attack')
        model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if args.test_attacker == 'rand_fgsm':
        test_alpha_adv = args.test_epsilon_adv * 0.5
    print(f'Test attacker: {args.test_attacker} - '
          f'epsilon: {args.test_epsilon_adv}')

    if args.test_attacker == 'cw2':
        test_iteration = 30
    else:
        test_iteration = 30

    state = env.reset()
    state = torch.from_numpy(state).unsqueeze(0).cuda()
    reward_sum = 0
    done = True
    episode_length = 0
    total_step = 0
    actions = deque(maxlen=100)
    reward_ep = []

    for epoch in range(test_iteration):
        model.load_state_dict(shared_model.state_dict())
        model.eval().cuda()
        rewards = []

        is_Terminal = False
        while not is_Terminal:
            episode_length += 1
            total_step += 1
            with torch.no_grad():
                value, logit = model(state)
            prob = F.softmax(logit, dim=-1)
            action = prob.multinomial(num_samples=1)[0]

            # adversarial attack on the current observation
            if args.variation == 'adversary':
                if args.test_attacker == 'fgsm':
                    state_adv = FGSM(model, name='a3c',
                                     eps=args.test_epsilon_adv)._attack(
                                         state, action)  # (1, 3, 80, 80)
                elif args.test_attacker == 'rand_fgsm':
                    state_adv = RandFGSM(model, name='a3c',
                                         eps=args.test_epsilon_adv,
                                         alpha=test_alpha_adv)._attack(
                                             state, action)
                elif args.test_attacker == 'cw2':
                    state_adv = CW2(model, name='a3c')._attack(
                        state, action, env.action_space.n)
                else:
                    sys.exit('Attacker must be one of FGSM | Rand+FGSM | CW2!')

            # RL-VAEGAN style-transfer defense
            state_def = translate_model.transform_adv(state_adv)

            with torch.no_grad():
                value_def, logit_def = model(state_def)
            prob_def = F.softmax(logit_def, dim=-1)
            action_def = prob_def.multinomial(num_samples=1)[0]

            state, reward, done, _ = env.step(action_def.item())
            done = done or episode_length >= args.max_episode_length
            actions.append(action_def.item())

            # a quick hack to prevent the agent from getting stuck
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                print(f'epoch {epoch} | {test_iteration} - '
                      f'steps {episode_length} - '
                      f'total rewards {np.sum(rewards) + reward}')
                reward_ep.append(np.sum(rewards) + reward)
                print('episode rewards:', reward_ep, 'avg: ',
                      np.sum(reward_ep) / len(reward_ep))
                episode_length = 0
                actions.clear()
                state = env.reset()

            rewards.append(reward)
            state = torch.from_numpy(state).unsqueeze(0).cuda()

            if done:
                is_Terminal = True

    print('episode rewards:', reward_ep, 'avg: ',
          np.sum(reward_ep) / len(reward_ep))
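
# Illustrative sketch (an assumption about what the FGSM class above computes,
# not its actual implementation): an FGSM-style attack on a policy perturbs
# the observation along the sign of the gradient of the cross-entropy between
# the policy logits and the currently chosen action. The helper name
# `fgsm_attack` is hypothetical; `model` is assumed to return (value, logit)
# as elsewhere in this code.
import torch
import torch.nn.functional as F


def fgsm_attack(model, state, action, eps):
    state_adv = state.clone().detach().requires_grad_(True)
    _, logit = model(state_adv)
    loss = F.cross_entropy(logit, action.view(-1))
    loss.backward()
    with torch.no_grad():
        # single signed-gradient step of size eps
        state_adv = state_adv + eps * state_adv.grad.sign()
    return state_adv.detach()
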