Example #1
def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'):
    import gym
    from nes_py.wrappers import JoypadSpace
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT

    import gym_super_mario_bros
    stage_names = [
        'SuperMarioBros-1-1-v1',
        'SuperMarioBros-1-2-v1',
        'SuperMarioBros-1-3-v1',
        'SuperMarioBros-1-4-v1',
    ]

    # Note: the 'name' argument is unused here; a fixed entry from stage_names is selected.
    env = gym_super_mario_bros.make(stage_names[1])
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # 'wrappers' is assumed to be imported at module level; it provides MaxAndSkipEnv and wrap_deepmind.
    env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env,
                                 episode_life=False,
                                 clip_rewards=False,
                                 frame_stack=True,
                                 scale=True)
    #env = wrappers.AllowBacktracking(env)

    return env
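A minimal usage sketch for the factory above (assuming nes-py and gym-super-mario-bros are installed; the rollout below is illustrative and not part of the original code):

env = create_super_mario_env_stage1()
state = env.reset()
for _ in range(100):
    # Classic gym API: step returns (observation, reward, done, info).
    state, reward, done, info = env.step(env.action_space.sample())
    if done:
        state = env.reset()
env.close()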
Example #2
def mini_test(model, config, logger, dtype, num_episodes=10, max_frames_per_episode=30000):
    logger.log('start mini test')
    training_config = config['training_config']
    env_params = training_config['env_params']
    env_params['clip_rewards'] = False
    env_params['episode_life'] = False
    env_id = config['env_id']

    if 'NoFrameskip' not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)
    state = env.reset()
    all_rewards = []
    episode_reward = 0

    seed = random.randint(0, sys.maxsize)
    logger.log('resetting env with seed', seed)
    env.seed(seed)
    state = env.reset()

    episode_idx = 1
    this_episode_frame = 1
    for frame_idx in range(1, num_episodes * max_frames_per_episode + 1):
        state_tensor = torch.from_numpy(np.ascontiguousarray(state)).unsqueeze(0).cuda().to(torch.float32)
        if dtype in UINTS:
            state_tensor /= 255
        action = model.act(state_tensor)[0]
        next_state, reward, done, _ = env.step(action)

        # logger.log(action)
        state = next_state
        episode_reward += reward
        if this_episode_frame == max_frames_per_episode:
            logger.log('maximum number of frames reached in this episode, reset environment!')
            done = True

        if done:
            logger.log('resetting env with seed', seed)
            state = env.reset()
            all_rewards.append(episode_reward)
            logger.log('episode {}/{} reward: {:6g}'.format(episode_idx, num_episodes, all_rewards[-1]))
            episode_reward = 0
            this_episode_frame = 1
            episode_idx += 1
            if episode_idx > num_episodes:
                break
        else:
            this_episode_frame += 1
    return np.mean(all_rewards)
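mini_test relies on a module-level UINTS constant that the excerpt does not show; a plausible definition, consistent with the /255 pixel normalization it gates, would be the following (an assumption, not the original code):

import numpy as np

# Assumed: dtypes treated as raw unsigned-integer pixel data that need scaling to [0, 1].
UINTS = [np.uint8, np.uint16, np.uint32, np.uint64]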
Example #3
def create_super_mario_env(name='SuperMarioBros-v1'):
    import gym
    from nes_py.wrappers import JoypadSpace
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
    import gym_super_mario_bros
    env = gym_super_mario_bros.make(name)
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env,
                                 episode_life=False,
                                 clip_rewards=False,
                                 frame_stack=True,
                                 scale=True)
    return env
Example #4
def worker_initializer(env_id, env_params, seed, save_frames=False):
    from common.wrappers import make_atari, make_atari_cart, wrap_deepmind, wrap_pytorch
    from setproctitle import setproctitle
    global env, return_unprocessed
    return_unprocessed = save_frames
    setproctitle('atari-env')
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)
    random.seed(seed)
    seed = random.randint(0, sys.maxsize)
    print('resetting env with seed', seed, 'in initializer')
    env.seed(seed)
    state = env.reset()
    env.seed(seed)
    print('state shape', state.shape)
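worker_initializer keeps the environment in a process-global, which is the usual pattern for per-worker state in a multiprocessing pool; a hedged usage sketch (the pool size and the use of multiprocessing.Pool here are illustrative assumptions, not from the original code):

from multiprocessing import Pool

# Each worker process builds and seeds its own environment once, in the initializer.
pool = Pool(processes=4,
            initializer=worker_initializer,
            initargs=(env_id, env_params, seed))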
Example #5
    config.win_break = True
    config.prioritized_replay = args.prioritized_replay
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_eps = 1e-6

    if args.env == 'PongNoFrameskip-v4':
        config.win_reward = 17
    elif args.env == 'BreakoutNoFrameskip-v4':
        config.win_reward = 200
    elif args.env == 'BoxingNoFrameskip-v4':
        config.win_reward = 200

    # handle the atari env
    env = make_atari(config.env)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    config.action_dim = env.action_space.n
    config.state_shape = env.observation_space.shape
    agent = CnnDDQNAgent(config)

    if args.train:
        trainer = Trainer(agent, env, config)
        trainer.train()

    elif args.test:
        if args.model_path is None:
            print('please add the model path:', '--model_path xxxx')
            exit(0)
        tester = Tester(agent, env, args.model_path)
Example #6
import numpy as np
import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using: ', device)

# env = gym.envs.make("BreakoutNoFrameskip-v4")

from common.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from collections import deque

env_id = "BreakoutNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env, frame_stack=True)
env = wrap_pytorch(env)


# Replay Buffer
class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
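        # The excerpt ends here; a common body for this deque-based buffer is sketched
        # below (an assumption, not necessarily the original code; assumes `import random`).
        state, action, reward, next_state, done = zip(
            *random.sample(self.buffer, batch_size))
        return (np.concatenate(state), action, reward,
                np.concatenate(next_state), done)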
Example #7
def _thunk():
    # env = gym.make(env_name)
    env = make_atari(env_name)
    env = wrap_deepmind(env, frame_stack=True)
    env = wrap_pytorch(env)
    return env
Example #8
def make_env():
    def _thunk():
        # env = gym.make(env_name)
        env = make_atari(env_name)
        env = wrap_deepmind(env, frame_stack=True)
        env = wrap_pytorch(env)
        return env

    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env2 = make_atari(env_name)
env2 = wrap_deepmind(env2, frame_stack=True)
env2 = wrap_pytorch(env2)


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU())

        self.critic = nn.Sequential(nn.Linear(3136, hidden_size), nn.ReLU(),
                                    nn.Linear(hidden_size, 1))
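        # The excerpt ends here; in this style of A2C model the matching actor head and
        # forward pass typically look like the following (an assumption, not the original code):
        self.actor = nn.Sequential(nn.Linear(3136, hidden_size), nn.ReLU(),
                                   nn.Linear(hidden_size, num_outputs),
                                   nn.Softmax(dim=1))

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten conv features to (batch, 3136)
        value = self.critic(x)
        dist = torch.distributions.Categorical(self.actor(x))
        return dist, value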
Example #9
def main(args):
    config = load_config(args)
    prefix = config['env_id']
    training_config = config['training_config']
    if config['name_suffix']:
        prefix += config['name_suffix']
    if config['path_prefix']:
        prefix = os.path.join(config['path_prefix'], prefix)
    if not os.path.exists(prefix):
        os.makedirs(prefix)

    train_log = os.path.join(prefix, 'train.log')
    logger = Logger(open(train_log, "w"))
    logger.log('Command line:', " ".join(sys.argv[:]))
    logger.log(args)
    logger.log(config)

    env_params = training_config['env_params']
    env_id = config['env_id']
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)

    seed = training_config['seed']
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    state = env.reset()
    dtype = state.dtype
    logger.log("env_shape: {}, num of actions: {}".format(
        env.observation_space.shape, env.action_space.n))
    if "NoFrameskip" in env_id:
        logger.log('action meaning:',
                   env.unwrapped.get_action_meanings()[:env.action_space.n])

    robust = training_config.get('robust', False)
    adv_train = training_config.get('adv_train', False)
    bound_solver = training_config.get('bound_solver', 'cov')
    attack_config = {}
    if adv_train or bound_solver == 'pgd':
        test_config = config['test_config']
        attack_config = training_config["attack_config"]
        adv_ratio = training_config.get('adv_ratio', 1)
        if adv_train:
            logger.log('using adversarial examples for training, adv ratio:',
                       adv_ratio)
        else:
            logger.log('using pgd regularization training')
    if robust or adv_train:
        schedule_start = training_config['schedule_start']
        schedule_length = training_config['schedule_length']
        starting_epsilon = training_config['start_epsilon']
        end_epsilon = training_config['epsilon']
        epsilon_scheduler = EpsilonScheduler(
            training_config.get("schedule_type", "linear"), schedule_start,
            schedule_start + schedule_length - 1, starting_epsilon,
            end_epsilon, 1)
        max_eps = end_epsilon

    model_width = training_config['model_width']
    robust_model = robust and bound_solver != 'pgd'
    dueling = training_config.get('dueling', True)

    current_model = model_setup(env_id, env, robust_model, logger, USE_CUDA,
                                dueling, model_width)
    target_model = model_setup(env_id, env, robust_model, logger, USE_CUDA,
                               dueling, model_width)

    load_path = training_config["load_model_path"]
    if load_path != "" and os.path.exists(load_path):
        load_frame = int(re.findall('^.*frame_([0-9]+).pth$', load_path)[0])
        logger.log('\ntrain from model {}, current frame index is {}\n'.format(
            load_path, load_frame))
        current_model.features.load_state_dict(torch.load(load_path))
        target_model.features.load_state_dict(torch.load(load_path))
    else:
        logger.log('\ntrain from scratch')
        load_frame = 1

    lr = training_config['lr']
    grad_clip = training_config['grad_clip']
    natural_loss_fn = training_config['natural_loss_fn']
    optimizer = optim.Adam(current_model.parameters(),
                           lr=lr,
                           eps=training_config['adam_eps'])
    # Do not evaluate gradient for target model.
    for param in target_model.features.parameters():
        param.requires_grad = False

    buffer_config = training_config['buffer_params']
    replay_initial = buffer_config['replay_initial']
    buffer_capacity = buffer_config['buffer_capacity']
    use_cpp_buffer = training_config["cpprb"]
    use_async_rb = training_config['use_async_rb']
    num_frames = training_config['num_frames']
    batch_size = training_config['batch_size']
    gamma = training_config['gamma']

    if use_cpp_buffer:
        logger.log('using cpp replay buffer')
        if use_async_rb:
            replay_buffer_ctor = AsyncReplayBuffer(initial_state=state,
                                                   batch_size=batch_size)
        else:
            replay_buffer_ctor = cpprb.PrioritizedReplayBuffer
    else:
        logger.log('using python replay buffer')
    per = training_config['per']

    if per:
        logger.log('using prioritized experience replay.')
        alpha = buffer_config['alpha']
        buffer_beta_start = buffer_config['buffer_beta_start']
        buffer_beta_frames = buffer_config.get('buffer_beta_frames', -1)
        if buffer_beta_frames < replay_initial:
            buffer_beta_frames = num_frames - replay_initial
            logger.log('buffer_beta_frames reset to ', buffer_beta_frames)
        buffer_beta_scheduler = BufferBetaScheduler(buffer_beta_start,
                                                    buffer_beta_frames,
                                                    start_frame=replay_initial)
        if use_cpp_buffer:
            replay_buffer = replay_buffer_ctor(
                size=buffer_capacity,
                # env_dict={"obs": {"shape": state.shape, "dtype": np.uint8},
                env_dict={
                    "obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "act": {
                        "shape": 1,
                        "dtype": np.uint8
                    },
                    "rew": {},
                    # "next_obs": {"shape": state.shape, "dtype": np.uint8},
                    "next_obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "done": {}
                },
                alpha=alpha,
                eps=0.0)  # We add eps manually in training loop
        else:
            replay_buffer = PrioritizedReplayBuffer(buffer_capacity,
                                                    alpha=alpha)

    else:
        logger.log('using regular replay.')
        if use_cpp_buffer:
            replay_buffer = cpprb.ReplayBuffer(
                buffer_capacity,
                # {"obs": {"shape": state.shape, "dtype": np.uint8},
                {
                    "obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "act": {
                        "shape": 1,
                        "dtype": np.uint8
                    },
                    "rew": {},
                    # "next_obs": {"shape": state.shape, "dtype": np.uint8},
                    "next_obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "done": {}
                })
        else:
            replay_buffer = ReplayBuffer(buffer_capacity)

    update_target(current_model, target_model)

    act_epsilon_start = training_config['act_epsilon_start']
    act_epsilon_final = training_config['act_epsilon_final']
    act_epsilon_decay = training_config['act_epsilon_decay']
    act_epsilon_method = training_config['act_epsilon_method']
    if training_config.get('act_epsilon_decay_zero', True):
        decay_zero = num_frames
    else:
        decay_zero = None
    act_epsilon_scheduler = ActEpsilonScheduler(act_epsilon_start,
                                                act_epsilon_final,
                                                act_epsilon_decay,
                                                method=act_epsilon_method,
                                                start_frame=replay_initial,
                                                decay_zero=decay_zero)

    # Use optimized cuda memory management
    memory_mgr = CudaTensorManager(state.shape,
                                   batch_size,
                                   per,
                                   USE_CUDA,
                                   dtype=dtype)

    losses = []
    td_losses = []
    batch_cur_q = []
    batch_exp_q = []

    sa = None
    kappa = None
    hinge = False
    if robust:
        logger.log(
            'using convex relaxation certified classification loss as a regularization!'
        )
        kappa = training_config['kappa']
        reg_losses = []
        sa = np.zeros(
            (current_model.num_actions, current_model.num_actions - 1),
            dtype=np.int32)
        for i in range(sa.shape[0]):
            for j in range(sa.shape[1]):
                if j < i:
                    sa[i][j] = j
                else:
                    sa[i][j] = j + 1
        sa = torch.LongTensor(sa)
        hinge = training_config.get('hinge', False)
        logger.log('using hinge loss (default is cross entropy): ', hinge)

    if training_config['use_async_env']:
        # Create an environment in a separate process, run asynchronously
        async_env = AsyncEnv(env_id,
                             result_path=prefix,
                             draw=training_config['show_game'],
                             record=training_config['record_game'],
                             env_params=env_params,
                             seed=seed)

    # initialize parameters in logging
    all_rewards = []
    episode_reward = 0
    act_epsilon = np.nan
    grad_norm = np.nan
    weights_norm = np.nan
    best_test_reward = -float('inf')
    buffer_stored_size = 0
    if adv_train:
        attack_count = 0
        suc_count = 0
    if robust and bound_solver == 'pgd':
        ori_margin, adv_margin = np.nan, np.nan

    start_time = time.time()
    period_start_time = time.time()

    # Main Loop
    for frame_idx in range(load_frame, num_frames + 1):
        # Step 1: get current action
        frame_start = time.time()
        t = time.time()

        eps = 0
        if adv_train or robust:
            eps = epsilon_scheduler.get_eps(frame_idx, 0)

        act_epsilon = act_epsilon_scheduler.get(frame_idx)
        # Only attack when eps is defined (not NaN) and non-negligible.
        if adv_train and not np.isnan(eps) and eps >= np.finfo(np.float32).tiny:
            ori_state_tensor = torch.from_numpy(
                np.ascontiguousarray(state)).unsqueeze(0).cuda().to(
                    torch.float32)
            if dtype in UINTS:
                ori_state_tensor /= 255
            attack_config['params']['epsilon'] = eps
            if random.random() < adv_ratio:
                attack_count += 1
                state_tensor = attack(current_model, ori_state_tensor,
                                      attack_config)
                if current_model.act(state_tensor)[0] != current_model.act(
                        ori_state_tensor)[0]:
                    suc_count += 1
            else:
                state_tensor = ori_state_tensor
            action = current_model.act(state_tensor, act_epsilon)[0]
        else:
            with torch.no_grad():
                state_tensor = torch.from_numpy(
                    np.ascontiguousarray(state)).unsqueeze(0).cuda().to(
                        torch.float32)
                if dtype in UINTS:
                    state_tensor /= 255
                ori_state_tensor = torch.clone(state_tensor)
                action = current_model.act(state_tensor, act_epsilon)[0]

        # torch.cuda.synchronize()
        log_time('act_time', time.time() - t)

        # Step 2: run environment
        t = time.time()
        if training_config['use_async_env']:
            async_env.async_step(action)
        else:
            next_state, reward, done, _ = env.step(action)
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer
        # For asynchronous env, defer saving
        if not training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state,
                                  act=action,
                                  rew=reward,
                                  next_obs=next_state,
                                  done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        if use_cpp_buffer:
            buffer_stored_size = replay_buffer.get_stored_size()
        else:
            buffer_stored_size = len(replay_buffer)

        beta = np.nan
        buffer_beta = np.nan
        t = time.time()

        if buffer_stored_size > replay_initial:
            if training_config['per']:
                buffer_beta = buffer_beta_scheduler.get(frame_idx)
            if robust:
                convex_final_beta = training_config['convex_final_beta']
                convex_start_beta = training_config['convex_start_beta']
                beta = (
                    max_eps - eps *
                    (1.0 - convex_final_beta)) / max_eps * convex_start_beta

            res = compute_td_loss(current_model,
                                  target_model,
                                  batch_size,
                                  replay_buffer,
                                  per,
                                  use_cpp_buffer,
                                  use_async_rb,
                                  optimizer,
                                  gamma,
                                  memory_mgr,
                                  robust,
                                  buffer_beta=buffer_beta,
                                  grad_clip=grad_clip,
                                  natural_loss_fn=natural_loss_fn,
                                  eps=eps,
                                  beta=beta,
                                  sa=sa,
                                  kappa=kappa,
                                  dtype=dtype,
                                  hinge=hinge,
                                  hinge_c=training_config.get('hinge_c', 1),
                                  env_id=env_id,
                                  bound_solver=bound_solver,
                                  attack_config=attack_config)
            loss, grad_norm, weights_norm, td_loss, batch_cur_q_value, batch_exp_q_value = res[
                0], res[1], res[2], res[3], res[4], res[5]
            if robust:
                reg_loss = res[-1]
                reg_losses.append(reg_loss.data.item())
                if bound_solver == 'pgd':
                    ori_margin, adv_margin = res[-3].data.item(
                    ), res[-2].data.item()

            losses.append(loss.data.item())
            td_losses.append(td_loss.data.item())
            batch_cur_q.append(batch_cur_q_value.data.item())
            batch_exp_q.append(batch_exp_q_value.data.item())

        log_time('loss_time', time.time() - t)

        # Step 2: run environment (async)
        t = time.time()
        if training_config['use_async_env']:
            next_state, reward, done, _ = async_env.wait_step()
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer (async)
        if training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state,
                                  act=action,
                                  rew=reward,
                                  next_obs=next_state,
                                  done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        # Update states and reward
        t = time.time()
        state = next_state
        episode_reward += reward
        if done:
            if training_config['use_async_env']:
                state = async_env.reset()
            else:
                state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
        log_time('env_time', time.time() - t)

        # All kinds of result logging
        if frame_idx % training_config[
                'print_frame'] == 0 or frame_idx == num_frames or (
                    robust and abs(frame_idx - schedule_start) < 5
                ) or abs(buffer_stored_size - replay_initial) < 5:
            logger.log(
                '\nframe {}/{}, learning rate: {:.6g}, buffer beta: {:.6g}, action epsilon: {:.6g}'
                .format(frame_idx, num_frames, lr, buffer_beta, act_epsilon))
            logger.log(
                'total time: {:.2f}, epoch time: {:.4f}, speed: {:.2f} frames/sec, last total loss: {:.6g}, avg total loss: {:.6g}, grad norm: {:.6g}, weights_norm: {:.6g}, latest episode reward: {:.6g}, avg 10 episode reward: {:.6g}'
                .format(
                    time.time() - start_time,
                    time.time() - period_start_time,
                    training_config['print_frame'] /
                    (time.time() - period_start_time),
                    losses[-1] if losses else np.nan,
                    np.average(losses[:-training_config['print_frame'] -
                                      1:-1]) if losses else np.nan, grad_norm,
                    weights_norm, all_rewards[-1] if all_rewards else np.nan,
                    np.average(all_rewards[:-11:-1])
                    if all_rewards else np.nan))
            logger.log('last td loss: {:.6g}, avg td loss: {:.6g}'.format(
                td_losses[-1] if td_losses else np.nan,
                np.average(td_losses[:-training_config['print_frame'] -
                                     1:-1]) if td_losses else np.nan))
            logger.log(
                'last batch cur q: {:.6g}, avg batch cur q: {:.6g}'.format(
                    batch_cur_q[-1] if batch_cur_q else np.nan,
                    np.average(batch_cur_q[:-training_config['print_frame'] -
                                           1:-1]) if batch_cur_q else np.nan))
            logger.log(
                'last batch exp q: {:.6g}, avg batch exp q: {:.6g}'.format(
                    batch_exp_q[-1] if batch_exp_q else np.nan,
                    np.average(batch_exp_q[:-training_config['print_frame'] -
                                           1:-1]) if batch_exp_q else np.nan))
            if robust:
                logger.log('current input epsilon: {:.6g}'.format(eps))
                if bound_solver == 'pgd':
                    logger.log(
                        'last logit margin: ori: {:.6g}, adv: {:.6g}'.format(
                            ori_margin, adv_margin))
                else:
                    logger.log('current bound beta: {:.6g}'.format(beta))
                logger.log(
                    'last cert reg loss: {:.6g}, avg cert reg loss: {:.6g}'.
                    format(
                        reg_losses[-1] if reg_losses else np.nan,
                        np.average(
                            reg_losses[:-training_config['print_frame'] -
                                       1:-1]) if reg_losses else np.nan))
                logger.log('current kappa: {:.6g}'.format(kappa))
            if adv_train:
                logger.log(
                    'current attack epsilon (same as input epsilon): {:.6g}'.
                    format(eps))
                diff = ori_state_tensor - state_tensor
                diff = np.abs(diff.data.cpu().numpy())
                logger.log('current Linf distortion: {:.6g}'.format(
                    np.max(diff)))
                logger.log(
                    'this batch attacked: {}, success: {}, attack success rate: {:.6g}'
                    .format(
                        attack_count, suc_count, suc_count * 1.0 /
                        attack_count if attack_count > 0 else np.nan))
                attack_count = 0
                suc_count = 0
                logger.log('attack stats reset.')

            period_start_time = time.time()
            log_time.print()
            log_time.clear()

        if frame_idx % training_config[
                'save_frame'] == 0 or frame_idx == num_frames:
            plot(frame_idx, all_rewards, losses, prefix)
            torch.save(current_model.features.state_dict(),
                       '{}/frame_{}.pth'.format(prefix, frame_idx))

        if frame_idx % training_config['update_target_frame'] == 0:
            update_target(current_model, target_model)

        if frame_idx % training_config.get('mini_test', 100000) == 0 and (
            (robust and beta == 0) or
            (not robust and frame_idx * 1.0 / num_frames >= 0.8)):
            test_reward = mini_test(current_model, config, logger, dtype)
            logger.log('this test avg reward: {:6g}'.format(test_reward))
            if test_reward >= best_test_reward:
                best_test_reward = test_reward
                logger.log(
                    'new best reward {:6g} achieved, update checkpoint'.format(
                        test_reward))
                torch.save(current_model.features.state_dict(),
                           '{}/best_frame_{}.pth'.format(prefix, frame_idx))

        log_time.log_time('total', time.time() - frame_start)
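Example #9 calls several repository helpers (update_target, plot, log_time, compute_td_loss, ...) that are not shown; as a hedged sketch, update_target is typically a hard copy of the online network's weights into the target network:

def update_target(current_model, target_model):
    # Hard update: overwrite the target Q-network with the online Q-network's weights.
    target_model.features.load_state_dict(current_model.features.state_dict())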
Example #10
def main(args):
    config = load_config(args)
    prefix = config['env_id']
    training_config = config['training_config']
    test_config = config['test_config']
    attack_config = test_config["attack_config"]
    if config['name_suffix']:
        prefix += config['name_suffix']
    if config['path_prefix']:
        prefix = os.path.join(config['path_prefix'], prefix)
    if 'load_model_path' in test_config and os.path.isfile(
            test_config['load_model_path']):
        if not os.path.exists(prefix):
            os.makedirs(prefix)
        test_log = os.path.join(prefix, test_config['log_name'])
    else:
        if os.path.exists(prefix):
            test_log = os.path.join(prefix, test_config['log_name'])
        else:
            raise ValueError(
                'Path {} does not exist, please specify a test model path.'.format(prefix))
    logger = Logger(open(test_log, "w"))
    logger.log('Command line:', " ".join(sys.argv[:]))
    logger.log(args)
    logger.log(config)
    certify = test_config.get('certify', False)
    env_params = training_config['env_params']
    env_params['clip_rewards'] = False
    env_params['episode_life'] = False
    env_id = config['env_id']

    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)

    state = env.reset()
    dtype = state.dtype
    logger.log("env_shape: {}, num of actions: {}".format(
        env.observation_space.shape, env.action_space.n))

    model_width = training_config['model_width']
    robust_model = certify
    dueling = training_config.get('dueling', True)

    model = model_setup(env_id, env, robust_model, logger, USE_CUDA, dueling,
                        model_width)

    if 'load_model_path' in test_config and os.path.isfile(
            test_config['load_model_path']):
        model_path = test_config['load_model_path']
    else:
        logger.log("choosing the best model from " + prefix)
        all_idx = [
            int(f[6:-4]) for f in os.listdir(prefix)
            if os.path.isfile(os.path.join(prefix, f))
            and os.path.splitext(f)[1] == '.pth' and 'best' not in f
        ]
        all_best_idx = [
            int(f[11:-4]) for f in os.listdir(prefix)
            if os.path.isfile(os.path.join(prefix, f))
            and os.path.splitext(f)[1] == '.pth' and 'best' in f
        ]
        if all_best_idx:
            model_frame_idx = max(all_best_idx)
            model_name = 'best_frame_{}.pth'.format(model_frame_idx)
        else:
            model_frame_idx = max(all_idx)
            model_name = 'frame_{}.pth'.format(model_frame_idx)
        model_path = os.path.join(prefix, model_name)

    logger.log('model loaded from ' + model_path)
    model.features.load_state_dict(torch.load(model_path))
    num_episodes = test_config['num_episodes']
    max_frames_per_episode = test_config['max_frames_per_episode']

    all_rewards = []
    episode_reward = 0

    seed = random.randint(0, sys.maxsize)
    logger.log('resetting env with seed', seed)
    env.seed(seed)
    state = env.reset()
    start_time = time.time()
    if training_config['use_async_env']:
        # Create an environment in a separate process, run asynchronously
        async_env = AsyncEnv(env_id,
                             result_path=prefix,
                             draw=training_config['show_game'],
                             record=training_config['record_game'],
                             save_frames=test_config['save_frames'],
                             env_params=env_params,
                             seed=args.seed)

    episode_idx = 1
    this_episode_frame = 1

    if certify:
        certified = 0

    if dtype in UINTS:
        state_max = 1.0
        state_min = 0.0
    else:
        state_max = float('inf')
        state_min = float('-inf')

    for frame_idx in range(1, num_episodes * max_frames_per_episode + 1):

        state_tensor = torch.from_numpy(
            np.ascontiguousarray(state)).unsqueeze(0).cuda().to(torch.float32)
        # Normalize input pixel to 0-1
        if dtype in UINTS:
            state_tensor /= 255

        if test_config['attack']:
            attack_config['params']['robust_model'] = certify
            state_tensor = attack(model, state_tensor, attack_config)

        if certify:
            beta = training_config.get('convex_final_beta', 0)
            eps = attack_config['params']['epsilon']
            if env_id == 'Acrobot-v1':
                eps_v = get_acrobot_eps(eps)
                if USE_CUDA:
                    eps_v = eps_v.cuda()
            else:
                eps_v = eps
            state_ub = torch.clamp(state_tensor + eps_v, max=state_max)
            state_lb = torch.clamp(state_tensor - eps_v, min=state_min)

        action = model.act(state_tensor)[0]

        if certify:
            max_logit = torch.tensor([action])
            c = torch.eye(model.num_actions).type_as(
                state_tensor)[max_logit].unsqueeze(1) - torch.eye(
                    model.num_actions).type_as(state_tensor).unsqueeze(0)
            I = (~(max_logit.data.unsqueeze(1) == torch.arange(
                model.num_actions).type_as(max_logit.data).unsqueeze(0)))
            c = (c[I].view(state_tensor.size(0), model.num_actions - 1,
                           model.num_actions))
            logits_diff_lb = get_logits_lower_bound(model, state_tensor,
                                                    state_ub, state_lb, eps_v,
                                                    c, beta)
            if torch.min(logits_diff_lb[0], 0)[0].data.cpu().numpy() > 0:
                certified += 1

        if training_config['use_async_env']:
            async_env.async_step(action)
            next_state, reward, done, _ = async_env.wait_step()
        else:
            next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward

        if frame_idx % test_config['print_frame'] == 0:
            logger.log(
                '\ntotal frame {}/{}, episode {}/{}, episode frame {}/{}, latest episode reward: {:.6g}, avg 10 episode reward: {:.6g}'
                .format(
                    frame_idx, num_episodes * max_frames_per_episode,
                    episode_idx, num_episodes, this_episode_frame,
                    max_frames_per_episode,
                    all_rewards[-1] if all_rewards else np.nan,
                    np.average(all_rewards[:-11:-1])
                    if all_rewards else np.nan))
            if certify:
                logger.log(
                    'certified action: {}, certified action ratio: {:.6g}'.
                    format(certified, certified * 1.0 / frame_idx))

        if this_episode_frame == max_frames_per_episode:
            logger.log(
                'maximum number of frames reached in this episode, reset environment!'
            )
            done = True
            if training_config['use_async_env']:
                async_env.epi_reward = 0

        if done:
            logger.log('resetting env with seed', seed)
            if training_config['use_async_env']:
                state = async_env.reset()
            else:
                state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
            this_episode_frame = 1
            episode_idx += 1
            if episode_idx > num_episodes:
                break
        else:
            this_episode_frame += 1

    logger.log('\navg reward' + (' and avg certify:' if certify else ':'))
    logger.log(np.mean(all_rewards), '+-', np.std(all_rewards))
    if certify:
        logger.log(certified * 1.0 / frame_idx)
Example #11
else:
    raise ValueError('invalid environment')

# more general settings
if args.env.lower() in ['cartpole', 'acrobot']:
    model = CategoricalDQN
    lr = 4e-4
    replay_buffer_size = 10000
    update_target_every = 200
    state_space = env.observation_space.shape[0]
else:
    model = CategoricalCnnDQN
    lr = 2e-4
    replay_buffer_size = 100000
    update_target_every = 1000
    env = wrap_pytorch(wrap_deepmind(env))
    state_space = env.observation_space.shape

args = to_attr(args_dict)

# setup loss function
if 'kl' in args.loss.lower():
    loss_fn = KL(args)
elif 'wasserstein' in args.loss.lower():
    loss_fn = Wasserstein(args)
elif 'cramer' in args.loss.lower():
    loss_fn = Cramer(args)

# initialize replay buffer
replay_buffer = ReplayBuffer(replay_buffer_size)
logger = Logger(args.base_dir)
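to_attr converts the argument dictionary into an attribute-access namespace so that args.loss, args.base_dir, etc. work; a minimal stand-in (an assumption, the repository's own helper may differ):

from types import SimpleNamespace

def to_attr(d):
    # Expose dict keys as attributes, e.g. to_attr({'loss': 'kl'}).loss == 'kl'.
    return SimpleNamespace(**d)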
Example #12
def train_atari_lstm(**kwargs):

    random.seed(3)

    mem_capacity = kwargs['mem_capacity']
    batch = kwargs['batch']
    lr = kwargs['lr']
    double_dqn = kwargs['double_dqn']
    gamma = kwargs['gamma']
    num_steps = kwargs['num_steps']
    target_update_freq = kwargs['target_update_freq']
    learn_start = kwargs['learn_start']
    plot_update_freq = kwargs['plot_update_freq']
    eval_freq = kwargs['eval_freq']
    eval_episodes = kwargs['eval_episodes']
    eps_decay = kwargs['eps_decay']
    eps_end = kwargs['eps_end']
    inner_linear_dim = kwargs['inner_linear_dim']
    hidden_dim = kwargs['hidden_dim']
    lstm_layers = kwargs['lstm_layers']
    l1_regularization = kwargs['l1_regularization']
    dropout = kwargs['dropout']
    is_visdom = kwargs['is_visdom']
    write_mode = kwargs['write_mode']
    traj_len = kwargs['traj_len']
    is_rnn = kwargs['is_rnn']
    flickering_p = kwargs['flickering_p']
    # is_flickering = kwargs['is_flickering']

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    eval_env = make_atari(env_id)
    eval_env = wrap_deepmind(eval_env)
    eval_env = wrap_pytorch(eval_env)

    # env = gameEnv(size=grid_dim, startDelay=num_of_obj, maxSteps=maxSteps - 2)
    # eval_env = gameEnv(size=grid_dim, startDelay=num_of_obj, maxSteps=maxSteps - 2)
    # input_size = env.observation_space.n
    input_size = env.observation_space.shape
    output_size = env.action_space.n

    Transition = namedtuple(
        'Transition',
        ('state', 'action', 'reward', 'next_state', 'done', 'pad_mask'))

    def pad_episode(episode_transitions):

        zero_transition = Transition(np.zeros(episode_transitions[0][0].shape),
                                     0, 0,
                                     np.zeros(episode_transitions[0][0].shape),
                                     0, 0)

        for i in range(traj_len - len(episode_transitions)):
            episode_transitions.append(zero_transition)
        return episode_transitions

    f = open(kwargs['output_path'], write_mode)

    network = DRQN_atari(input_size,
                         output_size,
                         inner_linear_dim,
                         hidden_dim,
                         lstm_layers,
                         batch,
                         traj_len,
                         seed=3,
                         device=device,
                         is_rnn=is_rnn).to(device)
    target_network = DRQN_atari(input_size,
                                output_size,
                                inner_linear_dim,
                                hidden_dim,
                                lstm_layers,
                                batch,
                                traj_len,
                                seed=3,
                                device=device,
                                is_rnn=is_rnn).to(device)
    target_network.load_state_dict(network.state_dict())

    # using pretrained models
    # network.load_state_dict(torch.load('drqn_12.202898550706951'))
    # target_network.load_state_dict(torch.load('drqn_12.202898550706951'))

    memory = ReplayBuffer(mem_capacity, batch)

    optimizer = optim.Adam(network.parameters(), lr=lr)

    average_rewards = []
    avg_rew_steps = []
    losses = []
    losses_steps = []
    episode_transitions = []
    done = True
    traj_steps_cnt = 0
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                  epsilon_decay)

    for step in range(num_steps):

        if done or traj_steps_cnt % traj_len == 0:
            traj_steps_cnt = 0
            if len(episode_transitions) > 0:
                episode_transitions = pad_episode(episode_transitions)
                memory.add_episode(episode_transitions)
            episode_transitions = []
            if done:
                state = env.reset()
                network.hidden = network.init_hidden()

        traj_steps_cnt += 1
        # old epsilon
        # eps = max((eps_decay - step + learn_start) / eps_decay, eps_end)
        # new epsilon
        eps = epsilon_by_frame(step)

        if random.random() > eps:
            q_value = network(
                Variable(torch.FloatTensor(
                    np.float32(state)).unsqueeze(0).to(device),
                         volatile=True))
            q_value = q_value.view(-1, output_size).cpu().detach().numpy()
            action = np.argmax(q_value)
        else:
            action = random.randrange(env.action_space.n)

        next_state, reward, done, _ = env.step(action)

        # with a chosen probability, screen is fully obscured (following the paper: https://arxiv.org/pdf/1507.06527.pdf)
        if decision(flickering_p):
            next_state = np.zeros(next_state.shape)

        # after we made a step render it to visualize
        if is_visdom:
            env.render()

        # update plots
        # if env.done and step % plot_update_freq == 0 and is_visdom:
        #     env.updatePlots(is_learn_start=(step > learn_start))

        # Done due to timeout is a non-markovian property. This is an artifact which we would not like to learn from.
        # if not (done and reward < 0):
        # memory.add(state, action, reward, next_state, not done)
        episode_transitions.append(
            Transition(state, action, reward, next_state, not done,
                       1))  # Todo - done or not done

        state = next_state

        # save the current hidden vector to restore it after training step
        so_far_hidden = network.clone_hidden()

        # train part
        if step > learn_start:
            # TODO - is it better to save the hidden vec too at the beginning of each traj, or maybe it's wrong since the weights are changing
            network.batch_hidden = network.init_batch_hidden()
            target_network.batch_hidden = target_network.init_batch_hidden()
            optimizer.zero_grad()

            batch_state, batch_action, batch_reward, batch_next_state, not_done_mask, is_pad_mask = memory.sample_episode(
            )

            batch_state = Variable(
                torch.FloatTensor(np.float32(batch_state)).to(device))
            batch_next_state = Variable(torch.FloatTensor(
                np.float32(batch_next_state)).to(device),
                                        volatile=True)
            batch_action = torch.tensor(batch_action, dtype=torch.int64).view(
                batch * traj_len, -1).to(device)
            batch_reward = torch.tensor(batch_reward,
                                        dtype=torch.float32).view(
                                            batch * traj_len, -1).to(device)
            not_done_mask = torch.tensor(not_done_mask,
                                         dtype=torch.float32).view(
                                             batch * traj_len, -1).to(device)
            is_pad_mask = torch.tensor(is_pad_mask, dtype=torch.float32).view(
                batch * traj_len, -1).to(device)

            # current_Q = network.forward_batch(batch_state).view(-1,4).gather(1, batch_action) * is_pad_mask
            current_Q = network.forward(batch_state).view(
                -1, output_size).gather(1, batch_action) * is_pad_mask
            # current_Q = network(batch_state).view(batch,-1).gather(1, batch_action) * is_pad_mask

            with torch.no_grad():
                if double_dqn:
                    next_state_actions = network(batch_next_state).max(
                        1, keepdim=True)[1]
                    next_Q = target_network(batch_next_state).gather(
                        1, next_state_actions)
                else:
                    next_Q = target_network.forward(batch_next_state).view(
                        -1, output_size).max(1, keepdim=True)[0]

                target_Q = batch_reward + (
                    gamma * next_Q) * not_done_mask * is_pad_mask

            # loss = F.smooth_l1_loss(current_Q, target_Q)
            loss = (current_Q - target_Q).pow(2).mean()
            # all_params = torch.cat([x.view(-1) for x in model.parameters()])
            all_params = torch.cat([x.view(-1) for x in network.parameters()])
            # loss += l1_regularization * torch.norm(all_params, 1)
            # TODO: do we want to clamp like this? Maybe the interesting info is above abs(1), so we would need tanh etc.
            # loss = torch.clamp(loss, min=-1, max=1)

            if step % plot_update_freq == 0:
                print('loss is: %f' % loss)

            loss.backward()
            # found as helpful to limit max grad values
            #         for param in network.parameters():
            #             param.grad.data.clamp_(-1, 1)
            optimizer.step()
            losses.append(loss.item())
            losses_steps.append(step)
            # # plot losses
            # plt.figure(4)
            # plt.plot(losses_steps,losses)
            # plt.title("Losses")
            # env.vis.matplot(plt,win=4)

        # after training session we restore the hidden vector values
        network.hidden = so_far_hidden

        if step % target_update_freq == 0:
            # print('target network update')
            target_network.load_state_dict(network.state_dict())
        # TODO - adapt to atari code
        if step % eval_freq == 0 and step > learn_start:
            network.eval()
            # save the current hidden vector to restore it after training step
            so_far_hidden = network.clone_hidden()

            total_reward = 0
            for eval_ep in range(eval_episodes):

                network.hidden = network.init_hidden()
                eval_state = eval_env.reset()
                while True:
                    # if is_visdom:
                    eval_env.render()

                    # action = network(state).max(1)[1].item()

                    q_value = network(
                        Variable(torch.FloatTensor(
                            np.float32(eval_state)).unsqueeze(0).to(device),
                                 volatile=True))
                    q_value = q_value.view(-1,
                                           output_size).cpu().detach().numpy()
                    action = np.argmax(q_value)

                    if random.random() < 0.01:
                        action = random.randrange(output_size)

                    eval_state, reward, done, _ = eval_env.step(action)

                    total_reward += reward
                    if done:
                        break
            network.train()

            # after evaluation session we restore the hidden vector values
            network.hidden = so_far_hidden

            average_reward = total_reward * 1.0 / eval_episodes
            average_rewards.append(average_reward)
            avg_rew_steps.append(step)
            print('Step: ' + str(step) + ' Avg reward: ' + str(average_reward))
            f.write('Step: ' + str(step) + ' Avg reward: ' +
                    str(average_reward) + '\n')
        # if step > learn_start and len(losses) > 0 and len(average_rewards) > 0 and step % 1000 == 0:
        #     clear_output()
        #     pl.plot(losses_steps, losses)
        #     pl.title('Loss')
        #     pl.show()
        #     pl.plot(avg_rew_steps, average_rewards)
        #     pl.title('Reward')
        #     pl.show()

    tot_avg_reward = sum(average_rewards) / (float(len(average_rewards)) +
                                             0.0000000001)
    print('Run average reward: ' + str(tot_avg_reward))
    f.write('Run average reward: ' + str(tot_avg_reward) + '\n')
    f.close()
    model_path = "model:_lr_{:f}_batch_size:_{:f}_trajectory_length:_{:f}_flickering_p_{:f}_is_rnn:_{:s}".format(
        kwargs['lr'], kwargs['batch'], kwargs['traj_len'],
        kwargs['flickering_p'], str(kwargs['is_rnn']))
    torch.save(network.state_dict(), model_path + str(tot_avg_reward))
    return tot_avg_reward
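The exponential schedule epsilon_by_frame above decays from 1.0 toward 0.01 with a 30000-frame time constant; a quick standalone sanity check of a few values (plain arithmetic, not from the original code):

import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000
eps = lambda t: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-t / epsilon_decay)
print(round(eps(0), 3), round(eps(30000), 3), round(eps(90000), 3))  # 1.0 0.374 0.059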