def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'):
    import gym
    from nes_py.wrappers import JoypadSpace
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
    import gym_super_mario_bros
    stage_names = [
        'SuperMarioBros-1-1-v1',
        'SuperMarioBros-1-2-v1',
        'SuperMarioBros-1-3-v1',
        'SuperMarioBros-1-4-v1',
    ]
    env = gym_super_mario_bros.make(stage_names[1])
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                 frame_stack=True, scale=True)
    # env = wrappers.AllowBacktracking(env)
    return env
def mini_test(model, config, logger, dtype, num_episodes=10, max_frames_per_episode=30000):
    logger.log('start mini test')
    training_config = config['training_config']
    env_params = training_config['env_params']
    env_params['clip_rewards'] = False
    env_params['episode_life'] = False
    env_id = config['env_id']
    if 'NoFrameskip' not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
    env = wrap_deepmind(env, **env_params)
    env = wrap_pytorch(env)
    state = env.reset()
    all_rewards = []
    episode_reward = 0
    seed = random.randint(0, sys.maxsize)
    logger.log('resetting env with seed', seed)
    env.seed(seed)
    state = env.reset()
    episode_idx = 1
    this_episode_frame = 1
    for frame_idx in range(1, num_episodes * max_frames_per_episode + 1):
        state_tensor = torch.from_numpy(np.ascontiguousarray(state)).unsqueeze(0).cuda().to(torch.float32)
        if dtype in UINTS:
            state_tensor /= 255
        action = model.act(state_tensor)[0]
        next_state, reward, done, _ = env.step(action)
        # logger.log(action)
        state = next_state
        episode_reward += reward
        if this_episode_frame == max_frames_per_episode:
            logger.log('maximum number of frames reached in this episode, reset environment!')
            done = True
        if done:
            logger.log('resetting env with seed', seed)
            state = env.reset()
            all_rewards.append(episode_reward)
            logger.log('episode {}/{} reward: {:6g}'.format(episode_idx, num_episodes, all_rewards[-1]))
            episode_reward = 0
            this_episode_frame = 1
            episode_idx += 1
            if episode_idx > num_episodes:
                break
        else:
            this_episode_frame += 1
    return np.mean(all_rewards)
def create_super_mario_env(name='SuperMarioBros-v1'):
    import gym
    from nes_py.wrappers import JoypadSpace
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
    import gym_super_mario_bros
    env = gym_super_mario_bros.make(name)
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                 frame_stack=True, scale=True)
    return env
def worker_initializer(env_id, env_params, seed, save_frames=False):
    from common.wrappers import make_atari, make_atari_cart, wrap_deepmind, wrap_pytorch
    from setproctitle import setproctitle
    global env, return_unprocessed
    return_unprocessed = save_frames
    setproctitle('atari-env')
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
    env = wrap_deepmind(env, **env_params)
    env = wrap_pytorch(env)
    random.seed(seed)
    seed = random.randint(0, sys.maxsize)
    print('resetting env with seed', seed, 'in initializer')
    env.seed(seed)
    state = env.reset()
    env.seed(seed)
    print('state shape', state.shape)
config.win_break = True
config.prioritized_replay = args.prioritized_replay
prioritized_replay_alpha = 0.6
prioritized_replay_beta0 = 0.4
prioritized_replay_eps = 1e-6
if args.env == 'PongNoFrameskip-v4':
    config.win_reward = 17
elif args.env == 'BreakoutNoFrameskip-v4':
    config.win_reward = 200
elif args.env == 'BoxingNoFrameskip-v4':
    config.win_reward = 200

# handle the atari env
env = make_atari(config.env)
env = wrap_deepmind(env)
env = wrap_pytorch(env)
config.action_dim = env.action_space.n
config.state_shape = env.observation_space.shape
agent = CnnDDQNAgent(config)

if args.train:
    trainer = Trainer(agent, env, config)
    trainer.train()
elif args.test:
    if args.model_path is None:
        print('please add the model path:', '--model_path xxxx')
        exit(0)
    tester = Tester(agent, env, args.model_path)
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using: ', device)

# env = gym.envs.make("BreakoutNoFrameskip-v4")
from common.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from collections import deque

env_id = "BreakoutNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env, frame_stack=True)
env = wrap_pytorch(env)


# Replay Buffer
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
def _thunk():
    # env = gym.make(env_name)
    env = make_atari(env_name)
    env = wrap_deepmind(env, frame_stack=True)
    env = wrap_pytorch(env)
    return env
def make_env():
    def _thunk():
        # env = gym.make(env_name)
        env = make_atari(env_name)
        env = wrap_deepmind(env, frame_stack=True)
        env = wrap_pytorch(env)
        return env

    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env2 = make_atari(env_name)
env2 = wrap_deepmind(env2, frame_stack=True)
env2 = wrap_pytorch(env2)


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU())
        self.critic = nn.Sequential(
            nn.Linear(3136, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1))
def main(args):
    config = load_config(args)
    prefix = config['env_id']
    training_config = config['training_config']
    if config['name_suffix']:
        prefix += config['name_suffix']
    if config['path_prefix']:
        prefix = os.path.join(config['path_prefix'], prefix)
    if not os.path.exists(prefix):
        os.makedirs(prefix)
    train_log = os.path.join(prefix, 'train.log')
    logger = Logger(open(train_log, "w"))
    logger.log('Command line:', " ".join(sys.argv[:]))
    logger.log(args)
    logger.log(config)

    env_params = training_config['env_params']
    env_id = config['env_id']
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)
    seed = training_config['seed']
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    state = env.reset()
    dtype = state.dtype
    logger.log("env_shape: {}, num of actions: {}".format(
        env.observation_space.shape, env.action_space.n))
    if "NoFrameskip" in env_id:
        logger.log('action meaning:',
                   env.unwrapped.get_action_meanings()[:env.action_space.n])

    robust = training_config.get('robust', False)
    adv_train = training_config.get('adv_train', False)
    bound_solver = training_config.get('bound_solver', 'cov')
    attack_config = {}
    if adv_train or bound_solver == 'pgd':
        test_config = config['test_config']
        attack_config = training_config["attack_config"]
        adv_ratio = training_config.get('adv_ratio', 1)
        if adv_train:
            logger.log('using adversarial examples for training, adv ratio:', adv_ratio)
        else:
            logger.log('using pgd regularization training')
    if robust or adv_train:
        schedule_start = training_config['schedule_start']
        schedule_length = training_config['schedule_length']
        starting_epsilon = training_config['start_epsilon']
        end_epsilon = training_config['epsilon']
        epsilon_scheduler = EpsilonScheduler(
            training_config.get("schedule_type", "linear"),
            schedule_start, schedule_start + schedule_length - 1,
            starting_epsilon, end_epsilon, 1)
        max_eps = end_epsilon

    model_width = training_config['model_width']
    robust_model = robust and bound_solver != 'pgd'
    dueling = training_config.get('dueling', True)
    current_model = model_setup(env_id, env, robust_model, logger, USE_CUDA, dueling, model_width)
    target_model = model_setup(env_id, env, robust_model, logger, USE_CUDA, dueling, model_width)

    load_path = training_config["load_model_path"]
    if load_path != "" and os.path.exists(load_path):
        load_frame = int(re.findall('^.*frame_([0-9]+).pth$', load_path)[0])
        logger.log('\ntrain from model {}, current frame index is {}\n'.format(load_path, load_frame))
        current_model.features.load_state_dict(torch.load(load_path))
        target_model.features.load_state_dict(torch.load(load_path))
    else:
        logger.log('\ntrain from scratch')
        load_frame = 1

    lr = training_config['lr']
    grad_clip = training_config['grad_clip']
    natural_loss_fn = training_config['natural_loss_fn']
    optimizer = optim.Adam(current_model.parameters(), lr=lr, eps=training_config['adam_eps'])

    # Do not evaluate gradient for target model.
    for param in target_model.features.parameters():
        param.requires_grad = False

    buffer_config = training_config['buffer_params']
    replay_initial = buffer_config['replay_initial']
    buffer_capacity = buffer_config['buffer_capacity']
    use_cpp_buffer = training_config["cpprb"]
    use_async_rb = training_config['use_async_rb']
    num_frames = training_config['num_frames']
    batch_size = training_config['batch_size']
    gamma = training_config['gamma']

    if use_cpp_buffer:
        logger.log('using cpp replay buffer')
        if use_async_rb:
            replay_buffer_ctor = AsyncReplayBuffer(initial_state=state, batch_size=batch_size)
        else:
            replay_buffer_ctor = cpprb.PrioritizedReplayBuffer
    else:
        logger.log('using python replay buffer')

    per = training_config['per']
    if per:
        logger.log('using prioritized experience replay.')
        alpha = buffer_config['alpha']
        buffer_beta_start = buffer_config['buffer_beta_start']
        buffer_beta_frames = buffer_config.get('buffer_beta_frames', -1)
        if buffer_beta_frames < replay_initial:
            buffer_beta_frames = num_frames - replay_initial
            logger.log('buffer_beta_frames reset to ', buffer_beta_frames)
        buffer_beta_scheduler = BufferBetaScheduler(buffer_beta_start, buffer_beta_frames,
                                                    start_frame=replay_initial)
        if use_cpp_buffer:
            replay_buffer = replay_buffer_ctor(
                size=buffer_capacity,
                # env_dict={"obs": {"shape": state.shape, "dtype": np.uint8},
                env_dict={
                    "obs": {"shape": state.shape, "dtype": dtype},
                    "act": {"shape": 1, "dtype": np.uint8},
                    "rew": {},
                    # "next_obs": {"shape": state.shape, "dtype": np.uint8},
                    "next_obs": {"shape": state.shape, "dtype": dtype},
                    "done": {}
                },
                alpha=alpha,
                eps=0.0)  # We add eps manually in training loop
        else:
            replay_buffer = PrioritizedReplayBuffer(buffer_capacity, alpha=alpha)
    else:
        logger.log('using regular replay.')
        if use_cpp_buffer:
            replay_buffer = cpprb.ReplayBuffer(
                buffer_capacity,
                # {"obs": {"shape": state.shape, "dtype": np.uint8},
                {
                    "obs": {"shape": state.shape, "dtype": dtype},
                    "act": {"shape": 1, "dtype": np.uint8},
                    "rew": {},
                    # "next_obs": {"shape": state.shape, "dtype": np.uint8},
                    "next_obs": {"shape": state.shape, "dtype": dtype},
                    "done": {}
                })
        else:
            replay_buffer = ReplayBuffer(buffer_capacity)

    update_target(current_model, target_model)

    act_epsilon_start = training_config['act_epsilon_start']
    act_epsilon_final = training_config['act_epsilon_final']
    act_epsilon_decay = training_config['act_epsilon_decay']
    act_epsilon_method = training_config['act_epsilon_method']
    if training_config.get('act_epsilon_decay_zero', True):
        decay_zero = num_frames
    else:
        decay_zero = None
    act_epsilon_scheduler = ActEpsilonScheduler(act_epsilon_start, act_epsilon_final, act_epsilon_decay,
                                                method=act_epsilon_method,
                                                start_frame=replay_initial,
                                                decay_zero=decay_zero)

    # Use optimized cuda memory management
    memory_mgr = CudaTensorManager(state.shape, batch_size, per, USE_CUDA, dtype=dtype)

    losses = []
    td_losses = []
    batch_cur_q = []
    batch_exp_q = []

    sa = None
    kappa = None
    hinge = False
    if robust:
        logger.log('using convex relaxation certified classification loss as a regularization!')
        kappa = training_config['kappa']
        reg_losses = []
        sa = np.zeros((current_model.num_actions, current_model.num_actions - 1), dtype=np.int32)
        for i in range(sa.shape[0]):
            for j in range(sa.shape[1]):
                if j < i:
                    sa[i][j] = j
                else:
                    sa[i][j] = j + 1
        sa = torch.LongTensor(sa)
        hinge = training_config.get('hinge', False)
        logger.log('using hinge loss (default is cross entropy): ', hinge)

    if training_config['use_async_env']:
        # Create an environment in a separate process, run asynchronously
        async_env = AsyncEnv(env_id, result_path=prefix,
                             draw=training_config['show_game'],
                             record=training_config['record_game'],
                             env_params=env_params, seed=seed)

    # initialize parameters in logging
    all_rewards = []
    episode_reward = 0
    act_epsilon = np.nan
    grad_norm = np.nan
    weights_norm = np.nan
    best_test_reward = -float('inf')
    buffer_stored_size = 0
    if adv_train:
        attack_count = 0
        suc_count = 0
    if robust and bound_solver == 'pgd':
        ori_margin, adv_margin = np.nan, np.nan

    start_time = time.time()
    period_start_time = time.time()

    # Main Loop
    for frame_idx in range(load_frame, num_frames + 1):
        # Step 1: get current action
        frame_start = time.time()
        t = time.time()

        eps = 0
        if adv_train or robust:
            eps = epsilon_scheduler.get_eps(frame_idx, 0)
        act_epsilon = act_epsilon_scheduler.get(frame_idx)
        if adv_train and eps != np.nan and eps >= np.finfo(np.float32).tiny:
            ori_state_tensor = torch.from_numpy(np.ascontiguousarray(state)).unsqueeze(0).cuda().to(torch.float32)
            if dtype in UINTS:
                ori_state_tensor /= 255
            attack_config['params']['epsilon'] = eps
            if random.random() < adv_ratio:
                attack_count += 1
                state_tensor = attack(current_model, ori_state_tensor, attack_config)
                if current_model.act(state_tensor)[0] != current_model.act(ori_state_tensor)[0]:
                    suc_count += 1
            else:
                state_tensor = ori_state_tensor
            action = current_model.act(state_tensor, act_epsilon)[0]
        else:
            with torch.no_grad():
                state_tensor = torch.from_numpy(np.ascontiguousarray(state)).unsqueeze(0).cuda().to(torch.float32)
                if dtype in UINTS:
                    state_tensor /= 255
                ori_state_tensor = torch.clone(state_tensor)
                action = current_model.act(state_tensor, act_epsilon)[0]
        # torch.cuda.synchronize()
        log_time('act_time', time.time() - t)

        # Step 2: run environment
        t = time.time()
        if training_config['use_async_env']:
            async_env.async_step(action)
        else:
            next_state, reward, done, _ = env.step(action)
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer
        # For asynchronous env, defer saving
        if not training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state, act=action, rew=reward, next_obs=next_state, done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        if use_cpp_buffer:
            buffer_stored_size = replay_buffer.get_stored_size()
        else:
            buffer_stored_size = len(replay_buffer)

        beta = np.nan
        buffer_beta = np.nan
        t = time.time()

        if buffer_stored_size > replay_initial:
            if training_config['per']:
                buffer_beta = buffer_beta_scheduler.get(frame_idx)
            if robust:
                convex_final_beta = training_config['convex_final_beta']
                convex_start_beta = training_config['convex_start_beta']
                beta = (max_eps - eps * (1.0 - convex_final_beta)) / max_eps * convex_start_beta

            res = compute_td_loss(
                current_model, target_model, batch_size, replay_buffer, per,
                use_cpp_buffer, use_async_rb, optimizer, gamma, memory_mgr, robust,
                buffer_beta=buffer_beta, grad_clip=grad_clip, natural_loss_fn=natural_loss_fn,
                eps=eps, beta=beta, sa=sa, kappa=kappa, dtype=dtype, hinge=hinge,
                hinge_c=training_config.get('hinge_c', 1), env_id=env_id,
                bound_solver=bound_solver, attack_config=attack_config)
            loss, grad_norm, weights_norm, td_loss, batch_cur_q_value, batch_exp_q_value = \
                res[0], res[1], res[2], res[3], res[4], res[5]
            if robust:
                reg_loss = res[-1]
                reg_losses.append(reg_loss.data.item())
                if bound_solver == 'pgd':
                    ori_margin, adv_margin = res[-3].data.item(), res[-2].data.item()

            losses.append(loss.data.item())
            td_losses.append(td_loss.data.item())
            batch_cur_q.append(batch_cur_q_value.data.item())
            batch_exp_q.append(batch_exp_q_value.data.item())

        log_time('loss_time', time.time() - t)

        # Step 2: run environment (async)
        t = time.time()
        if training_config['use_async_env']:
            next_state, reward, done, _ = async_env.wait_step()
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer (async)
        if training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state, act=action, rew=reward, next_obs=next_state, done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        # Update states and reward
        t = time.time()
        state = next_state
        episode_reward += reward
        if done:
            if training_config['use_async_env']:
                state = async_env.reset()
            else:
                state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
        log_time('env_time', time.time() - t)

        # All kinds of result logging
        if frame_idx % training_config['print_frame'] == 0 or frame_idx == num_frames \
                or (robust and abs(frame_idx - schedule_start) < 5) \
                or abs(buffer_stored_size - replay_initial) < 5:
            logger.log('\nframe {}/{}, learning rate: {:.6g}, buffer beta: {:.6g}, action epsilon: {:.6g}'.format(
                frame_idx, num_frames, lr, buffer_beta, act_epsilon))
            logger.log('total time: {:.2f}, epoch time: {:.4f}, speed: {:.2f} frames/sec, '
                       'last total loss: {:.6g}, avg total loss: {:.6g}, grad norm: {:.6g}, '
                       'weights_norm: {:.6g}, latest episode reward: {:.6g}, avg 10 episode reward: {:.6g}'.format(
                           time.time() - start_time,
                           time.time() - period_start_time,
                           training_config['print_frame'] / (time.time() - period_start_time),
                           losses[-1] if losses else np.nan,
                           np.average(losses[:-training_config['print_frame'] - 1:-1]) if losses else np.nan,
                           grad_norm, weights_norm,
                           all_rewards[-1] if all_rewards else np.nan,
                           np.average(all_rewards[:-11:-1]) if all_rewards else np.nan))
            logger.log('last td loss: {:.6g}, avg td loss: {:.6g}'.format(
                td_losses[-1] if td_losses else np.nan,
                np.average(td_losses[:-training_config['print_frame'] - 1:-1]) if td_losses else np.nan))
            logger.log('last batch cur q: {:.6g}, avg batch cur q: {:.6g}'.format(
                batch_cur_q[-1] if batch_cur_q else np.nan,
                np.average(batch_cur_q[:-training_config['print_frame'] - 1:-1]) if batch_cur_q else np.nan))
            logger.log('last batch exp q: {:.6g}, avg batch exp q: {:.6g}'.format(
                batch_exp_q[-1] if batch_exp_q else np.nan,
                np.average(batch_exp_q[:-training_config['print_frame'] - 1:-1]) if batch_exp_q else np.nan))
            if robust:
                logger.log('current input epsilon: {:.6g}'.format(eps))
                if bound_solver == 'pgd':
                    logger.log('last logit margin: ori: {:.6g}, adv: {:.6g}'.format(ori_margin, adv_margin))
                else:
                    logger.log('current bound beta: {:.6g}'.format(beta))
                logger.log('last cert reg loss: {:.6g}, avg cert reg loss: {:.6g}'.format(
                    reg_losses[-1] if reg_losses else np.nan,
                    np.average(reg_losses[:-training_config['print_frame'] - 1:-1]) if reg_losses else np.nan))
                logger.log('current kappa: {:.6g}'.format(kappa))
            if adv_train:
                logger.log('current attack epsilon (same as input epsilon): {:.6g}'.format(eps))
                diff = ori_state_tensor - state_tensor
                diff = np.abs(diff.data.cpu().numpy())
                logger.log('current Linf distortion: {:.6g}'.format(np.max(diff)))
                logger.log('this batch attacked: {}, success: {}, attack success rate: {:.6g}'.format(
                    attack_count, suc_count,
                    suc_count * 1.0 / attack_count if attack_count > 0 else np.nan))
                attack_count = 0
                suc_count = 0
                logger.log('attack stats reset.')

            period_start_time = time.time()
            log_time.print()
            log_time.clear()

        if frame_idx % training_config['save_frame'] == 0 or frame_idx == num_frames:
            plot(frame_idx, all_rewards, losses, prefix)
            torch.save(current_model.features.state_dict(), '{}/frame_{}.pth'.format(prefix, frame_idx))

        if frame_idx % training_config['update_target_frame'] == 0:
            update_target(current_model, target_model)

        if frame_idx % training_config.get('mini_test', 100000) == 0 and (
                (robust and beta == 0) or (not robust and frame_idx * 1.0 / num_frames >= 0.8)):
            test_reward = mini_test(current_model, config, logger, dtype)
            logger.log('this test avg reward: {:6g}'.format(test_reward))
            if test_reward >= best_test_reward:
                best_test_reward = test_reward
                logger.log('new best reward {:6g} achieved, update checkpoint'.format(test_reward))
                torch.save(current_model.features.state_dict(), '{}/best_frame_{}.pth'.format(prefix, frame_idx))

        log_time.log_time('total', time.time() - frame_start)
def main(args):
    config = load_config(args)
    prefix = config['env_id']
    training_config = config['training_config']
    test_config = config['test_config']
    attack_config = test_config["attack_config"]
    if config['name_suffix']:
        prefix += config['name_suffix']
    if config['path_prefix']:
        prefix = os.path.join(config['path_prefix'], prefix)
    if 'load_model_path' in test_config and os.path.isfile(test_config['load_model_path']):
        if not os.path.exists(prefix):
            os.makedirs(prefix)
        test_log = os.path.join(prefix, test_config['log_name'])
    else:
        if os.path.exists(prefix):
            test_log = os.path.join(prefix, test_config['log_name'])
        else:
            raise ValueError('Path {} does not exist, please specify a test model path.'.format(prefix))
    logger = Logger(open(test_log, "w"))
    logger.log('Command line:', " ".join(sys.argv[:]))
    logger.log(args)
    logger.log(config)

    certify = test_config.get('certify', False)
    env_params = training_config['env_params']
    env_params['clip_rewards'] = False
    env_params['episode_life'] = False
    env_id = config['env_id']
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)
    state = env.reset()
    dtype = state.dtype
    logger.log("env_shape: {}, num of actions: {}".format(
        env.observation_space.shape, env.action_space.n))

    model_width = training_config['model_width']
    robust_model = certify
    dueling = training_config.get('dueling', True)
    model = model_setup(env_id, env, robust_model, logger, USE_CUDA, dueling, model_width)

    if 'load_model_path' in test_config and os.path.isfile(test_config['load_model_path']):
        model_path = test_config['load_model_path']
    else:
        logger.log("choosing the best model from " + prefix)
        all_idx = [
            int(f[6:-4]) for f in os.listdir(prefix)
            if os.path.isfile(os.path.join(prefix, f))
            and os.path.splitext(f)[1] == '.pth' and 'best' not in f
        ]
        all_best_idx = [
            int(f[11:-4]) for f in os.listdir(prefix)
            if os.path.isfile(os.path.join(prefix, f))
            and os.path.splitext(f)[1] == '.pth' and 'best' in f
        ]
        if all_best_idx:
            model_frame_idx = max(all_best_idx)
            model_name = 'best_frame_{}.pth'.format(model_frame_idx)
        else:
            model_frame_idx = max(all_idx)
            model_name = 'frame_{}.pth'.format(model_frame_idx)
        model_path = os.path.join(prefix, model_name)
    logger.log('model loaded from ' + model_path)
    model.features.load_state_dict(torch.load(model_path))

    num_episodes = test_config['num_episodes']
    max_frames_per_episode = test_config['max_frames_per_episode']
    all_rewards = []
    episode_reward = 0

    seed = random.randint(0, sys.maxsize)
    logger.log('resetting env with seed', seed)
    env.seed(seed)
    state = env.reset()
    start_time = time.time()

    if training_config['use_async_env']:
        # Create an environment in a separate process, run asynchronously
        async_env = AsyncEnv(env_id, result_path=prefix,
                             draw=training_config['show_game'],
                             record=training_config['record_game'],
                             save_frames=test_config['save_frames'],
                             env_params=env_params, seed=args.seed)

    episode_idx = 1
    this_episode_frame = 1
    if certify:
        certified = 0
    if dtype in UINTS:
        state_max = 1.0
        state_min = 0.0
    else:
        state_max = float('inf')
        state_min = float('-inf')

    for frame_idx in range(1, num_episodes * max_frames_per_episode + 1):
        state_tensor = torch.from_numpy(np.ascontiguousarray(state)).unsqueeze(0).cuda().to(torch.float32)
        # Normalize input pixel to 0-1
        if dtype in UINTS:
            state_tensor /= 255
        if test_config['attack']:
            attack_config['params']['robust_model'] = certify
            state_tensor = attack(model, state_tensor, attack_config)
        if certify:
            beta = training_config.get('convex_final_beta', 0)
            eps = attack_config['params']['epsilon']
            if env_id == 'Acrobot-v1':
                eps_v = get_acrobot_eps(eps)
                if USE_CUDA:
                    eps_v = eps_v.cuda()
            else:
                eps_v = eps
            state_ub = torch.clamp(state_tensor + eps_v, max=state_max)
            state_lb = torch.clamp(state_tensor - eps_v, min=state_min)

        action = model.act(state_tensor)[0]

        if certify:
            max_logit = torch.tensor([action])
            c = torch.eye(model.num_actions).type_as(state_tensor)[max_logit].unsqueeze(1) \
                - torch.eye(model.num_actions).type_as(state_tensor).unsqueeze(0)
            I = (~(max_logit.data.unsqueeze(1) == torch.arange(
                model.num_actions).type_as(max_logit.data).unsqueeze(0)))
            c = (c[I].view(state_tensor.size(0), model.num_actions - 1, model.num_actions))
            logits_diff_lb = get_logits_lower_bound(model, state_tensor, state_ub, state_lb, eps_v, c, beta)
            if torch.min(logits_diff_lb[0], 0)[0].data.cpu().numpy() > 0:
                certified += 1

        if training_config['use_async_env']:
            async_env.async_step(action)
            next_state, reward, done, _ = async_env.wait_step()
        else:
            next_state, reward, done, _ = env.step(action)
        state = next_state
        episode_reward += reward

        if frame_idx % test_config['print_frame'] == 0:
            logger.log('\ntotal frame {}/{}, episode {}/{}, episode frame {}/{}, '
                       'latest episode reward: {:.6g}, avg 10 episode reward: {:.6g}'.format(
                           frame_idx, num_episodes * max_frames_per_episode,
                           episode_idx, num_episodes,
                           this_episode_frame, max_frames_per_episode,
                           all_rewards[-1] if all_rewards else np.nan,
                           np.average(all_rewards[:-11:-1]) if all_rewards else np.nan))
            if certify:
                logger.log('certified action: {}, certified action ratio: {:.6g}'.format(
                    certified, certified * 1.0 / frame_idx))

        if this_episode_frame == max_frames_per_episode:
            logger.log('maximum number of frames reached in this episode, reset environment!')
            done = True
            if training_config['use_async_env']:
                async_env.epi_reward = 0

        if done:
            logger.log('resetting env with seed', seed)
            if training_config['use_async_env']:
                state = async_env.reset()
            else:
                state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
            this_episode_frame = 1
            episode_idx += 1
            if episode_idx > num_episodes:
                break
        else:
            this_episode_frame += 1

    logger.log('\navg reward' + (' and avg certify:' if certify else ':'))
    logger.log(np.mean(all_rewards), '+-', np.std(all_rewards))
    if certify:
        logger.log(certified * 1.0 / frame_idx)
else:
    raise ValueError('invalid environment')

# more general settings
if args.env.lower() in ['cartpole', 'acrobot']:
    model = CategoricalDQN
    lr = 4e-4
    replay_buffer_size = 10000
    update_target_every = 200
    state_space = env.observation_space.shape[0]
else:
    model = CategoricalCnnDQN
    lr = 2e-4
    replay_buffer_size = 100000
    update_target_every = 1000
    env = wrap_pytorch(wrap_deepmind(env))
    state_space = env.observation_space.shape

args = to_attr(args_dict)

# setup loss function
if 'kl' in args.loss.lower():
    loss_fn = KL(args)
elif 'wasserstein' in args.loss.lower():
    loss_fn = Wasserstein(args)
elif 'cramer' in args.loss.lower():
    loss_fn = Cramer(args)

# initialize replay buffer
replay_buffer = ReplayBuffer(replay_buffer_size)
logger = Logger(args.base_dir)
def train_atari_lstm(**kwargs):
    random.seed(3)
    mem_capacity = kwargs['mem_capacity']
    batch = kwargs['batch']
    lr = kwargs['lr']
    double_dqn = kwargs['double_dqn']
    gamma = kwargs['gamma']
    num_steps = kwargs['num_steps']
    target_update_freq = kwargs['target_update_freq']
    learn_start = kwargs['learn_start']
    plot_update_freq = kwargs['plot_update_freq']
    eval_freq = kwargs['eval_freq']
    eval_episodes = kwargs['eval_episodes']
    eps_decay = kwargs['eps_decay']
    eps_end = kwargs['eps_end']
    inner_linear_dim = kwargs['inner_linear_dim']
    hidden_dim = kwargs['hidden_dim']
    lstm_layers = kwargs['lstm_layers']
    l1_regularization = kwargs['l1_regularization']
    dropout = kwargs['dropout']
    is_visdom = kwargs['is_visdom']
    write_mode = kwargs['write_mode']
    traj_len = kwargs['traj_len']
    is_rnn = kwargs['is_rnn']
    flickering_p = kwargs['flickering_p']
    # is_flickering = kwargs['is_flickering']

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    eval_env = make_atari(env_id)
    eval_env = wrap_deepmind(eval_env)
    eval_env = wrap_pytorch(eval_env)
    # env = gameEnv(size=grid_dim, startDelay=num_of_obj, maxSteps=maxSteps - 2)
    # eval_env = gameEnv(size=grid_dim, startDelay=num_of_obj, maxSteps=maxSteps - 2)

    # input_size = env.observation_space.n
    input_size = env.observation_space.shape
    output_size = env.action_space.n

    Transition = namedtuple(
        'Transition',
        ('state', 'action', 'reward', 'next_state', 'done', 'pad_mask'))

    def pad_episode(episode_transitions):
        zero_transition = Transition(np.zeros(episode_transitions[0][0].shape), 0, 0,
                                     np.zeros(episode_transitions[0][0].shape), 0, 0)
        for i in range(traj_len - len(episode_transitions)):
            episode_transitions.append(zero_transition)
        return episode_transitions

    f = open(kwargs['output_path'], write_mode)

    network = DRQN_atari(input_size, output_size, inner_linear_dim, hidden_dim, lstm_layers,
                         batch, traj_len, seed=3, device=device, is_rnn=is_rnn).to(device)
    target_network = DRQN_atari(input_size, output_size, inner_linear_dim, hidden_dim, lstm_layers,
                                batch, traj_len, seed=3, device=device, is_rnn=is_rnn).to(device)
    target_network.load_state_dict(network.state_dict())
    # using pretrained models
    # network.load_state_dict(torch.load('drqn_12.202898550706951'))
    # target_network.load_state_dict(torch.load('drqn_12.202898550706951'))

    memory = ReplayBuffer(mem_capacity, batch)
    optimizer = optim.Adam(network.parameters(), lr=lr)

    average_rewards = []
    avg_rew_steps = []
    losses = []
    losses_steps = []
    episode_transitions = []
    done = True
    traj_steps_cnt = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000
    epsilon_by_frame = lambda frame_idx: epsilon_final + \
        (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
    for step in range(num_steps):
        if done or traj_steps_cnt % traj_len == 0:
            traj_steps_cnt = 0
            if len(episode_transitions) > 0:
                episode_transitions = pad_episode(episode_transitions)
                memory.add_episode(episode_transitions)
                episode_transitions = []
            if done:
                state = env.reset()
            network.hidden = network.init_hidden()
        traj_steps_cnt += 1

        # old epsilon
        # eps = max((eps_decay - step + learn_start) / eps_decay, eps_end)
        # new epsilon
        eps = epsilon_by_frame(step)
        if random.random() > eps:
            q_value = network(Variable(torch.FloatTensor(
                np.float32(state)).unsqueeze(0).to(device), volatile=True))
            q_value = q_value.view(-1, output_size).cpu().detach().numpy()
            action = np.argmax(q_value)
        else:
            action = random.randrange(env.action_space.n)

        next_state, reward, done, _ = env.step(action)
        # with a chosen probability, screen is fully obscured
        # (following the paper: https://arxiv.org/pdf/1507.06527.pdf)
        if decision(flickering_p):
            next_state = np.zeros(next_state.shape)

        # after we made a step, render it to visualize
        if is_visdom:
            env.render()

        # update plots
        # if env.done and step % plot_update_freq == 0 and is_visdom:
        #     env.updatePlots(is_learn_start=(step > learn_start))

        # Done due to timeout is a non-Markovian property. This is an artifact which we would not like to learn from.
        # if not (done and reward < 0):
        #     memory.add(state, action, reward, next_state, not done)
        episode_transitions.append(
            Transition(state, action, reward, next_state, not done, 1))  # TODO - done or not done
        state = next_state

        # save the current hidden vector to restore it after training step
        so_far_hidden = network.clone_hidden()

        # train part
        if step > learn_start:
            # TODO - is it better to save the hidden vec too in the beginning of each traj,
            # or maybe it's wrong since the weights are changing
            network.batch_hidden = network.init_batch_hidden()
            target_network.batch_hidden = target_network.init_batch_hidden()
            optimizer.zero_grad()
            batch_state, batch_action, batch_reward, batch_next_state, not_done_mask, is_pad_mask = \
                memory.sample_episode()
            batch_state = Variable(torch.FloatTensor(np.float32(batch_state)).to(device))
            batch_next_state = Variable(torch.FloatTensor(np.float32(batch_next_state)).to(device), volatile=True)
            batch_action = torch.tensor(batch_action, dtype=torch.int64).view(batch * traj_len, -1).to(device)
            batch_reward = torch.tensor(batch_reward, dtype=torch.float32).view(batch * traj_len, -1).to(device)
            not_done_mask = torch.tensor(not_done_mask, dtype=torch.float32).view(batch * traj_len, -1).to(device)
            is_pad_mask = torch.tensor(is_pad_mask, dtype=torch.float32).view(batch * traj_len, -1).to(device)

            # current_Q = network.forward_batch(batch_state).view(-1, 4).gather(1, batch_action) * is_pad_mask
            current_Q = network.forward(batch_state).view(-1, output_size).gather(1, batch_action) * is_pad_mask
            # current_Q = network(batch_state).view(batch, -1).gather(1, batch_action) * is_pad_mask

            with torch.no_grad():
                if double_dqn:
                    next_state_actions = network(batch_next_state).max(1, keepdim=True)[1]
                    next_Q = target_network(batch_next_state).gather(1, next_state_actions)
                else:
                    next_Q = target_network.forward(batch_next_state).view(-1, output_size).max(1, keepdim=True)[0]

            target_Q = batch_reward + (gamma * next_Q) * not_done_mask * is_pad_mask

            # loss = F.smooth_l1_loss(current_Q, target_Q)
            loss = (current_Q - target_Q).pow(2).mean()
            # all_params = torch.cat([x.view(-1) for x in model.parameters()])
            all_params = torch.cat([x.view(-1) for x in network.parameters()])
            # loss += l1_regularization * torch.norm(all_params, 1)
            # TODO: do we want to clamp like this? Maybe the interesting info is above abs(1),
            # so we need to use tanh or similar instead.
            # loss = torch.clamp(loss, min=-1, max=1)
            if step % plot_update_freq == 0:
                print('loss is: %f' % loss)
            loss.backward()
            # found as helpful to limit max grad values
            # for param in network.parameters():
            #     param.grad.data.clamp_(-1, 1)
            optimizer.step()
            losses.append(loss.item())
            losses_steps.append(step)
            # # plot losses
            # plt.figure(4)
            # plt.plot(losses_steps, losses)
            # plt.title("Losses")
            # env.vis.matplot(plt, win=4)

            # after training session we restore the hidden vector values
            network.hidden = so_far_hidden

        if step % target_update_freq == 0:
            # print('target network update')
            target_network.load_state_dict(network.state_dict())

        # TODO - adapt to Atari code
        if step % eval_freq == 0 and step > learn_start:
            network.eval()
            # save the current hidden vector to restore it after the evaluation step
            so_far_hidden = network.clone_hidden()
            total_reward = 0
            for eval_ep in range(eval_episodes):
                network.hidden = network.init_hidden()
                eval_state = eval_env.reset()
                while True:
                    # if is_visdom: eval_env.render()
                    # action = network(state).max(1)[1].item()
                    q_value = network(Variable(torch.FloatTensor(
                        np.float32(eval_state)).unsqueeze(0).to(device), volatile=True))
                    q_value = q_value.view(-1, output_size).cpu().detach().numpy()
                    action = np.argmax(q_value)
                    if random.random() < 0.01:
                        action = random.randrange(output_size)
                    eval_state, reward, done, _ = eval_env.step(action)
                    total_reward += reward
                    if done:
                        break
            network.train()
            # after evaluation session we restore the hidden vector values
            network.hidden = so_far_hidden

            average_reward = total_reward * 1.0 / eval_episodes
            average_rewards.append(average_reward)
            avg_rew_steps.append(step)
            print('Step: ' + str(step) + ' Avg reward: ' + str(average_reward))
            f.write('Step: ' + str(step) + ' Avg reward: ' + str(average_reward) + '\n')

        # if step > learn_start and len(losses) > 0 and len(average_rewards) > 0 and step % 1000 == 0:
        #     clear_output()
        #     pl.plot(losses_steps, losses)
        #     pl.title('Loss')
        #     pl.show()
        #     pl.plot(avg_rew_steps, average_rewards)
        #     pl.title('Reward')
        #     pl.show()

    tot_avg_reward = sum(average_rewards) / (float(len(average_rewards)) + 0.0000000001)
    print('Run average reward: ' + str(tot_avg_reward))
    f.write('Run average reward: ' + str(tot_avg_reward) + '\n')
    f.close()

    model_path = "model:_lr_{:f}_batch_size:_{:f}_trajectory_length:_{:f}_flickering_p_{:f}_is_rnn:_{:s}".format(
        kwargs['lr'], kwargs['batch'], kwargs['traj_len'], kwargs['flickering_p'], str(kwargs['is_rnn']))
    torch.save(network.state_dict(), model_path + str(tot_avg_reward))
    return tot_avg_reward