def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        help='Model directory path.')
    parser.add_argument('--out',
                        type=str,
                        required=True,
                        help='ONNX file output path.')
    parser.add_argument('--gpu', type=int, default=0, help='GPU id.')
    args = parser.parse_args()

    # Predefined parameters.
    n_actions = 4  # env.action_space.n
    replay_start_size = 5 * 10**4

    # Load the model.
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)
    opt = chainer.optimizers.RMSpropGraves(lr=2.5e-4,
                                           alpha=0.95,
                                           momentum=0.0,
                                           eps=1e-2)
    opt.setup(q_func)
    rbuf = replay_buffer.ReplayBuffer(10**6)
    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=replay_start_size,
                  target_update_interval=10**4,
                  clip_delta=True,
                  update_interval=4,
                  batch_accumulator='sum',
                  phi=phi)
    agent.load(args.model)

    # Extract core links from the model and export these links as an ONNX format.
    onnx_compat_model = convert_to_compatible_model(agent)
    x = cp.array(np.zeros((1, 4, 84, 84), dtype=np.float32))
    onnx_chainer.export(onnx_compat_model,
                        x,
                        input_names='input',
                        output_names='action',
                        return_named_inout=True,
                        filename=args.out)
Exemple #2
0
def set_explorer(env):

    # Possible parameters:

    # Initial (and max) value of epsilon at the start of the experimentation.
    start_epsilon = 1.0

    # Minimum value of epsilon
    end_epsilon = 0.1

    # Constant epsilon
    cons_epsilon = 0.001

    # how many steps it takes for epsilon to decay
    final_exploration_steps = 10**5

    # Options for exploration (more explorers at site-packages/chainerrl/explorers/)

    # Option 1: Constant
    constant_epsilon_explorer = explorers.ConstantEpsilonGreedy(
        epsilon=cons_epsilon, random_action_func=env.action_space.sample)

    # Option 2: Linear decay
    decay_epsilon_explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon,
        end_epsilon,
        final_exploration_steps,
        random_action_func=str(env.action_space.sample))

    return constant_epsilon_explorer
Exemple #3
0
def create_agent(env):
    q_func = QFunction(env.grid_size ** 2, 4)

    start_epsilon = 1.
    end_epsilon = 0.8
    decay_steps = 20000

    explorer = explorers.LinearDecayEpsilonGreedy(start_epsilon, end_epsilon, decay_steps, env.random_action)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 3

    steps = 50000
    replay_start_size = 20
    update_interval = 10
    betasteps = (steps - replay_start_size) // update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity)

    phi = lambda x: x.astype(np.float32, copy=False)

    agent = DQN.DQN(q_func, opt, rbuf, gamma=0.99,
                    explorer=explorer, replay_start_size=replay_start_size,
                    phi=phi, minibatch_size=minibatch_size)
    return agent
    def dqn_q_values_and_neuronal_net(self, args, action_space, obs_size,
                                      obs_space):
        """
        learning process
        """

        if isinstance(action_space, spaces.Box):
            action_size = action_space.low.size
            # Use NAF to apply DQN to continuous action spaces
            q_func = q_functions.FCQuadraticStateQFunction(
                obs_size,
                action_size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                action_space=action_space)
            # Use the Ornstein-Uhlenbeck process for exploration
            ou_sigma = (action_space.high - action_space.low) * 0.2
            explorer = explorers.AdditiveOU(sigma=ou_sigma)
        else:
            n_actions = action_space.n
            # print("n_actions: ", n_actions)
            q_func = q_functions.FCStateQFunctionWithDiscreteAction(
                obs_size,
                n_actions,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers)
            # print("q_func ", q_func)
            # Use epsilon-greedy for exploration
            explorer = explorers.LinearDecayEpsilonGreedy(
                args.start_epsilon, args.end_epsilon,
                args.final_exploration_steps, action_space.sample)
            # print("explorer: ", explorer)

        if args.noisy_net_sigma is not None:
            links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
            # Turn off explorer
            explorer = explorers.Greedy()
        # print("obs_space.low : ", obs_space.shape)
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

        opt = optimizers.Adam()
        opt.setup(q_func)

        rbuf_capacity = 5 * 10**5
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                        // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                         betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

        return q_func, opt, rbuf, explorer
Exemple #5
0
    def _test_load_iqn(self, gpu):
        q_func = agents.iqn.ImplicitQuantileQFunction(
            psi=links.Sequence(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                functools.partial(F.reshape, shape=(-1, 3136)),
            ),
            phi=links.Sequence(
                agents.iqn.CosineBasisLinear(64, 3136),
                F.relu,
            ),
            f=links.Sequence(
                L.Linear(None, 512),
                F.relu,
                L.Linear(None, 4),
            ),
        )

        opt = chainer.optimizers.Adam(5e-5, eps=1e-2)
        opt.setup(q_func)

        rbuf = replay_buffer.ReplayBuffer(100)

        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(4))

        agent = agents.IQN(
            q_func,
            opt,
            rbuf,
            gpu=gpu,
            gamma=0.99,
            explorer=explorer,
            replay_start_size=50,
            target_update_interval=10**4,
            update_interval=4,
            batch_accumulator='mean',
            phi=lambda x: x,
            quantile_thresholds_N=64,
            quantile_thresholds_N_prime=64,
            quantile_thresholds_K=32,
        )

        model, exists = download_model("IQN",
                                       "BreakoutNoFrameskip-v4",
                                       model_type=self.pretrained_type)
        agent.load(model)
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
Exemple #6
0
    def create_agent(env):
        state_size = env.state_size
        action_size = env.action_size
        q_func = QFunction(state_size, action_size)

        start_epsilon = 1
        end_epsilon = 0.3
        decay_steps = 10
        explorer = explorers.LinearDecayEpsilonGreedy(start_epsilon,
                                                      end_epsilon, decay_steps,
                                                      env.random_action)

        opt = optimizers.Adam()
        opt.setup(q_func)

        # rbuf_capacity = 5 * 10 ** 3
        minibatch_size = 8

        # steps = 1000
        replay_start_size = 20
        update_interval = 5
        # betasteps = (steps - replay_start_size) // update_interval
        # rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps)
        rbuf = prioritized.PrioritizedReplayBuffer()

        phi = lambda x: x.astype(np.float32, copy=False)

        # agent = double_dqn.DoubleDQN(q_func,
        #                              opt,
        #                              rbuf,
        #                              gamma=0.99,
        #                              explorer=explorer,
        #                              replay_start_size=replay_start_size,
        #                              target_update_interval=10,  # target q网络多久和q网络同步
        #                              update_interval=update_interval,
        #                              phi=phi,
        #                              minibatch_size=minibatch_size,
        #                              gpu=args.gpu,  # 设置是否使用gpu
        #                              episodic_update_len=16)

        # 自己的DDQN
        agent = MyDoubleDQN(
            q_func,
            opt,
            rbuf,
            gamma=0.99,
            explorer=explorer,
            replay_start_size=replay_start_size,
            target_update_interval=10,  # target q网络多久和q网络同步
            update_interval=update_interval,
            phi=phi,
            minibatch_size=minibatch_size,
            gpu=args.gpu,  # 设置是否使用gpu
            episodic_update_len=16)

        return agent
def create_value_based_learner(cfg_name):
    """
    Creates a learner that can be used with value based algorithms from chainerrl.
    :param cfg_name: type str, the name of the config
    :return: chainerrl agent specified in config
    """
    vb_config = Config(cfg_name)
    network = getattr(models, vb_config.get_str('BASIC', 'network'))(
        **vb_config.get_section('NETWORK'))
    q_func = q_functions.SingleModelStateQFunctionWithDiscreteAction(model=network)
    opt = getattr(optimizers, vb_config.get_str('BASIC', 'optimizer'))(
        **vb_config.get_section('OPTIMIZER'))

    opt.setup(q_func)
    opt.add_hook(
        optimizer.GradientClipping(threshold=vb_config.get_float('BASIC', 'grad_clip')))
    rep_buf = replay_buffer.PrioritizedEpisodicReplayBuffer(
        capacity=vb_config.get_int('MEMORY_BUFFER', 'episodic_buffer_size'),
        wait_priority_after_sampling=vb_config.get_bool('MEMORY_BUFFER',
                                                        'wait_priority_after_sampling'))

    explorer = explorers.LinearDecayEpsilonGreedy(
        random_action_func=lambda: np.random.random_integers(0, vb_config.get_int('NETWORK',
                                                                                  'output_dim') - 1),
        **vb_config.get_section('EXPLORER'))

    try:
        learner = getattr(agents, vb_config.get_str('BASIC', 'learner'))(q_function=q_func,
                                                                         optimizer=opt,
                                                                         replay_buffer=rep_buf,
                                                                         phi=lambda x: x,
                                                                         explorer=explorer,
                                                                         **vb_config.get_section(
                                                                             'ALGORITHM'))
        if vb_config.get_str('BASIC', 'load_path'):
            learner.load(os.path.join(get_results_path(), vb_config.get_str('BASIC', 'load_path')))

    except AttributeError as e:
        logger.log(msg='Cannot find model {} in chainerrl.agents'.format(
            vb_config.get_str('BASIC', 'learner')),
            level=logging.ERROR)
        raise e

    logger.log(msg='Created learner {}'.format(learner.__class__.__name__),
               level=logging.INFO)
    logger.log(msg='Model parameters {}'.format(
        ' '.join([name + ':' + str(value) for name, value in
                  vb_config.get_section('EXPERIMENT').items()])), level=logging.INFO)
    logger.log(msg='Explorer parameters {}'.format(
        ' '.join([name + ':' + str(value) for name, value in
                  vb_config.get_section('EXPLORER').items()])), level=logging.INFO)

    return learner
Exemple #8
0
 def make_agent(process_idx):
     # Random epsilon assignment described in the original paper
     rand = random.random()
     if rand < 0.4:
         epsilon_target = 0.1
     elif rand < 0.7:
         epsilon_target = 0.01
     else:
         epsilon_target = 0.5
     explorer = explorers.LinearDecayEpsilonGreedy(
         1, epsilon_target, args.final_exploration_frames,
         action_space.sample)
     # Suppress the explorer logger
     explorer.logger.setLevel(logging.INFO)
     return nsq.NSQ(q_func, opt, t_max=5, gamma=0.99,
                    i_target=40000,
                    explorer=explorer, phi=dqn_phi)
Exemple #9
0
def create_agent(env):
    state_size = env.state_size
    action_size = env.action_size
    q_func = QFunction(state_size, action_size)

    start_epsilon = 1.
    end_epsilon = 0.3
    decay_steps = feature_max_count * MAX_EPISODE / 2
    explorer = explorers.LinearDecayEpsilonGreedy(start_epsilon, end_epsilon,
                                                  decay_steps,
                                                  env.random_action)

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**3
    minibatch_size = 16

    steps = 1000
    replay_start_size = 20
    update_interval = 10
    betasteps = (steps - replay_start_size) // update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                 betasteps=betasteps)

    phi = lambda x: x.astype(np.float32, copy=False)

    agent = DDQN.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=replay_start_size,
        target_update_interval=10,  # target q网络多久和q网络同步
        update_interval=update_interval,
        phi=phi,
        minibatch_size=minibatch_size,
        target_update_method='hard',
        soft_update_tau=1e-2,
        episodic_update=False,
        gpu=args.gpu,  # 设置是否使用gpu
        episodic_update_len=16)
    return agent
Exemple #10
0
    def __init__(self, modelpath, n_actions=4, n_stack_frames=4):
        # Predefined parameters.
        replay_start_size = 5 * 10**4

        # Load the model.
        q_func = links.Sequence(links.NatureDQNHead(),
                                L.Linear(512, n_actions), DiscreteActionValue)
        opt = chainer.optimizers.RMSpropGraves(lr=2.5e-4,
                                               alpha=0.95,
                                               momentum=0.0,
                                               eps=1e-2)
        opt.setup(q_func)
        rbuf = replay_buffer.ReplayBuffer(10**6)
        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(n_actions))

        def phi(x):
            # Feature extractor
            return np.asarray(x, dtype=np.float32) / 255

        Agent = agents.DQN
        self._agent = Agent(q_func,
                            opt,
                            rbuf,
                            gpu=-1,
                            gamma=0.99,
                            explorer=explorer,
                            replay_start_size=replay_start_size,
                            target_update_interval=10**4,
                            clip_delta=True,
                            update_interval=4,
                            batch_accumulator='sum',
                            phi=phi)
        self._agent.load(modelpath)

        self._state = deque([], maxlen=n_stack_frames)
        self._action = 0
Exemple #11
0
    def _test_load_dqn(self, gpu):
        q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, 4),
                                DiscreteActionValue)

        opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                       alpha=0.95,
                                       momentum=0.0,
                                       eps=1e-2)
        opt.setup(q_func)

        rbuf = replay_buffer.ReplayBuffer(100)

        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(4))

        agent = agents.DQN(q_func,
                           opt,
                           rbuf,
                           gpu=gpu,
                           gamma=0.99,
                           explorer=explorer,
                           replay_start_size=50,
                           target_update_interval=10**4,
                           clip_delta=True,
                           update_interval=4,
                           batch_accumulator='sum',
                           phi=lambda x: x)

        model, exists = download_model("DQN",
                                       "BreakoutNoFrameskip-v4",
                                       model_type=self.pretrained_type)
        agent.load(model)
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--eval-epsilon', type=float, default=0.001)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'])
    parser.add_argument('--steps', type=int, default=5 * 10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=3 * 10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent',
                        type=str,
                        default='DoubleDQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate')
    parser.add_argument('--prioritized',
                        action='store_true',
                        default=False,
                        help='Use prioritized experience replay.')
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            frame_stack=False,
        )
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        vec_env = chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])
        vec_env = chainerrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(10**6,
                                                     alpha=0.6,
                                                     beta0=0.4,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
Exemple #13
0
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if(type(args) is list):
        args=make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    n_actions = env.action_space.n

    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            functools.partial(F.reshape, shape=(-1, 3136)),
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
            F.relu,
        ),
        f=chainerrl.links.Sequence(
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
        ),
    )

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros((4, 84, 84), dtype=np.float32)[None]
    fake_taus = np.zeros(32, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(fake_obss)(fake_taus)],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1710.10044
    opt = chainer.optimizers.Adam(5e-5, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    if args.prioritized:
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10 ** 6, alpha=0.5, beta0=0.4, betasteps=betasteps,
            num_steps=args.num_step_return)
    else:
        rbuf = replay_buffer.ReplayBuffer(
            10 ** 6,
            num_steps=args.num_step_return)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator=args.batch_accumulator,
        phi=phi,
        quantile_thresholds_N=args.quantile_thresholds_N,
        quantile_thresholds_N_prime=args.quantile_thresholds_N_prime,
        quantile_thresholds_K=args.quantile_thresholds_K,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode=='train'):
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            checkpoint_freq=args.checkpoint_frequency,
            step_offset=args.step_offset,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
            log_type=args.log_type
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames / 4,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            # temporary hack to handle python 2/3 support issues.
            # json dumps does not support non-string literal dict keys
            json_stats = json.dumps(stats)
            print(str(json_stats), file=f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
    elif (args.mode=='check'):
        return tools.make_video.check(env=env,agent=agent,save_mp4=args.save_mp4)

    elif (args.mode=='growth'):
        return tools.make_video.growth(env=env,agent=agent,outdir=args.outdir,max_num=args.max_frames,save_mp4=args.save_mp4)
Exemple #14
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**5)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit)
Exemple #15
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)

    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
Exemple #16
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=123,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10 ** 4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=50000)
    parser.add_argument('--prioritized-replay', action='store_true', default=False)
    parser.add_argument('--episodic-replay', action='store_true', default=False)
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10 ** 2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=50)
    parser.add_argument('--eval-interval', type=int, default=10 ** 3)
    parser.add_argument('--n-hidden-channels', type=int, default=512)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true', default=True)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        ENV_NAME = 'malware-test-v0' if test else 'malware-v0'
        env = gym.make(ENV_NAME)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        # if not test:
        #     misc.env_modifiers.make_reward_filtered(
        #         env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    timestep_limit = 80
    obs_space = env.observation_space
    obs_size = obs_space.shape[0]
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
    if args.gpu >= 0:
        q_func.to_gpu(args.gpu)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if args.gpu < 0:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                        // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                        // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DoubleDQN(q_func, opt, rbuf, gamma=args.gamma,
                      explorer=explorer, replay_start_size=args.replay_start_size,
                      target_update_interval=args.target_update_interval,
                      update_interval=args.update_interval,
                      phi=phi, minibatch_size=args.minibatch_size,
                      target_update_method=args.target_update_method,
                      soft_update_tau=args.soft_update_tau,
                      episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        q_hook = PlotHook('Average Q Value')
        loss_hook = PlotHook('Average Loss', plot_index=1)

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit,
            step_hooks=[q_hook, loss_hook],
            successful_score=7
        )
Exemple #17
0
    def create_ddqn_agent(env, args):
        obs_size = env.observation_space.shape[0]
        action_space = env.action_space
        n_actions = action_space.n

        # q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        #     obs_size, n_actions,
        #     n_hidden_channels=args.n_hidden_channels,
        #     n_hidden_layers=args.n_hidden_layers)
        q_func = QFunction(obs_size, n_actions)
        if args.gpu:
            q_func.to_gpu(args.gpu)

        # Draw the computational graph and save it in the output directory.
        if not args.test and not args.gpu:
            chainerrl.misc.draw_computational_graph(
                [q_func(np.zeros_like(env.observation_space, dtype=np.float32)[None])],
                os.path.join(args.outdir, 'model'))

        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)
        # explorer = explorers.Boltzmann()
        # explorer = explorers.ConstantEpsilonGreedy(
        #     epsilon=0.3, random_action_func=env.action_space.sample)

        opt = optimizers.Adam()
        opt.setup(q_func)

        rbuf_capacity = 5 * 10 ** 3
        if args.episodic_replay:
            if args.minibatch_size is None:
                args.minibatch_size = 4
            if args.prioritized_replay:
                betasteps = (args.steps - args.replay_start_size) // args.update_interval
                rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(rbuf_capacity, betasteps=betasteps)
            else:
                rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
        else:
            if args.minibatch_size is None:
                args.minibatch_size = 32
            if args.prioritized_replay:
                betasteps = (args.steps - args.replay_start_size) // args.update_interval
                rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity, betasteps=betasteps)
            else:
                rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

        # Chainer only accepts numpy.float32 by default, make sure
        # a converter as a feature extractor function phi.
        phi = lambda x: x.astype(np.float32, copy=False)

        agent = chainerrl.agents.DoubleDQN(q_func, opt, rbuf, gamma=args.gamma,
                                           explorer=explorer, replay_start_size=args.replay_start_size,
                                           target_update_interval=args.target_update_interval,
                                           update_interval=args.update_interval,
                                           phi=phi, minibatch_size=args.minibatch_size,
                                           target_update_method=args.target_update_method,
                                           soft_update_tau=args.soft_update_tau,
                                           episodic_update=args.episodic_replay,
                                           episodic_update_len=16)

        return agent
Exemple #18
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--eval-epsilon', type=float, default=0.001)
    parser.add_argument('--steps', type=int, default=5 * 10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--batch-accumulator',
                        type=str,
                        default='mean',
                        choices=['mean', 'sum'])
    parser.add_argument('--quantile-thresholds-N', type=int, default=64)
    parser.add_argument('--quantile-thresholds-N-prime', type=int, default=64)
    parser.add_argument('--quantile-thresholds-K', type=int, default=32)
    parser.add_argument('--n-best-episodes', type=int, default=200)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    n_actions = env.action_space.n

    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            F.relu,
            functools.partial(F.reshape, shape=(-1, 3136)),
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
            F.relu,
        ),
        f=chainerrl.links.Sequence(
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
        ),
    )

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros((4, 84, 84), dtype=np.float32)[None]
    fake_taus = np.zeros(32, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph([q_func(fake_obss)(fake_taus)],
                                            os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1710.10044
    opt = chainer.optimizers.Adam(5e-5, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.IQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator=args.batch_accumulator,
        phi=phi,
        quantile_thresholds_N=args.quantile_thresholds_N,
        quantile_thresholds_N_prime=args.quantile_thresholds_N_prime,
        quantile_thresholds_K=args.quantile_thresholds_K,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None,
        )
        print('n_steps: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames / 4,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            # temporary hack to handle python 2/3 support issues.
            # json dumps does not support non-string literal dict keys
            json_stats = json.dumps(stats)
            print(str(json_stats), file=f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
Exemple #19
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int, default=10 ** 5,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon', type=float, default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--steps', type=int, default=10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--max-episode-len', type=int,
                        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
                        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int, default=1 * 10 ** 4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10 ** 5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--lr', type=float, default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    env = make_env(env_seed=args.seed)

    n_actions = env.action_space.n
    
    q_func = links.Sequence(
        links.NatureDQNHead(n_input_channels=3),
        L.Linear(512, n_actions),
        DiscreteActionValue
    )

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10 ** 6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None,
            n_actions,
            n_atoms,
            v_min,
            v_max,
            n_hidden_channels=0,
            n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
Exemple #21
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='dqn_out')
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**5)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(for_eval):
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not for_eval:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and for_eval)
                or (args.render_train and not for_eval)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(for_eval=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10**5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                         betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        return obs.astype(np.float32)

    agent = DQN(q_func,
                opt,
                rbuf,
                gpu=args.gpu,
                gamma=args.gamma,
                explorer=explorer,
                replay_start_size=args.replay_start_size,
                target_update_interval=args.target_update_interval,
                update_interval=args.update_interval,
                phi=phi,
                minibatch_size=args.minibatch_size,
                target_update_method=args.target_update_method,
                soft_update_tau=args.soft_update_tau,
                episodic_update=args.episodic_replay,
                episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(for_eval=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('rom', type=str)
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # In training, life loss is considered as terminal states
    env = ale.ALE(args.rom, use_sdl=args.use_sdl, seed=train_seed)
    misc.env_modifiers.make_reward_clipped(env, -1, 1)
    # In testing, an episode is terminated  when all lives are lost
    eval_env = ale.ALE(args.rom,
                       use_sdl=args.use_sdl,
                       treat_life_lost_as_terminal=False,
                       seed=test_seed)

    n_actions = env.number_of_actions

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = chainerrl.links.Sequence(
        chainerrl.links.NatureDQNHead(),
        chainerrl.q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
            None,
            n_actions,
            n_atoms,
            v_min,
            v_max,
            n_hidden_channels=0,
            n_hidden_layers=0),
    )

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(2.5e-4, eps=1e-2 / args.batch_size)
    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))
    agent = chainerrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=dqn_phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # In testing DQN, randomly select 5% of actions
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            eval_env=eval_env)
Exemple #23
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--arch',
                        type=str,
                        default='nature',
                        choices=['nature', 'nips', 'dueling'])
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=5 * 60 * 60 // 4,  # 5 minutes with 60/4 fps
        help='Maximum number of steps for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval', type=int, default=10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--activation', type=str, default='relu')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(args.env),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    activation = parse_activation(args.activation)
    q_func = parse_arch(args.arch, n_actions, activation)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        # In testing DQN, randomly select 5% of actions
        eval_explorer = explorers.ConstantEpsilonGreedy(
            args.eval_epsilon, lambda: np.random.randint(n_actions))
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_explorer=eval_explorer,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=eval_env,
        )
Exemple #24
0
def main():
    """Parses arguments and runs the example
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--env',
        type=str,
        default='MineRLTreechop-v0',
        choices=[
            'MineRLTreechop-v0',
            'MineRLNavigate-v0',
            'MineRLNavigateDense-v0',
            'MineRLNavigateExtreme-v0',
            'MineRLNavigateExtremeDense-v0',
            'MineRLObtainIronPickaxe-v0',
            'MineRLObtainIronPickaxeDense-v0',
            'MineRLObtainDiamond-v0',
            'MineRLObtainDiamondDense-v0',
            'MineRLNavigateDenseFixed-v0'  # for debug use
        ],
        help='MineRL environment identifier')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.01,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.001,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.add_argument('--error-max', type=float, default=1.0)
    parser.add_argument('--num-step-return', type=int, default=10)
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--logging-filename', type=str, default=None)
    parser.add_argument(
        '--monitor',
        action='store_true',
        default=False,
        help=
        'Monitor env. Videos and additional information are saved as output files when evaluation'
    )
    # parser.add_argument('--render', action='store_true', default=False,
    # help='Render env states in a GUI window.')
    parser.add_argument('--optimizer',
                        type=str,
                        default='rmsprop',
                        choices=['rmsprop', 'adam'])
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate')
    parser.add_argument(
        "--replay-buffer-size",
        type=int,
        default=10**6,
        help="Size of replay buffer (Excluding demonstrations)")
    parser.add_argument("--minibatch-size", type=int, default=32)
    parser.add_argument('--batch-accumulator', type=str, default="sum")
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument("--save-demo-trajectories",
                        action="store_true",
                        default=False)

    # DQfD specific parameters for loading and pretraining.
    parser.add_argument('--n-experts', type=int, default=10)
    parser.add_argument('--expert-demo-path', type=str, default=None)
    parser.add_argument('--n-pretrain-steps', type=int, default=750000)
    parser.add_argument('--demo-supervised-margin', type=float, default=0.8)
    parser.add_argument('--loss-coeff-l2', type=float, default=1e-5)
    parser.add_argument('--loss-coeff-nstep', type=float, default=1.0)
    parser.add_argument('--loss-coeff-supervised', type=float, default=1.0)
    parser.add_argument('--bonus-priority-agent', type=float, default=0.001)
    parser.add_argument('--bonus-priority-demo', type=float, default=1.0)

    # Action branching architecture
    parser.add_argument('--gradient-clipping',
                        action='store_true',
                        default=False)
    parser.add_argument('--gradient-rescaling',
                        action='store_true',
                        default=False)

    # NoisyNet parameters
    parser.add_argument('--use-noisy-net',
                        type=str,
                        default=None,
                        choices=['before-pretraining', 'after-pretraining'])
    parser.add_argument('--noisy-net-sigma', type=float, default=0.5)

    # Parameters for state/action handling
    parser.add_argument('--frame-stack',
                        type=int,
                        default=None,
                        help='Number of frames stacked (None for disable).')
    parser.add_argument('--frame-skip',
                        type=int,
                        default=None,
                        help='Number of frames skipped (None for disable).')
    parser.add_argument('--camera-atomic-actions', type=int, default=10)
    parser.add_argument('--max-range-of-camera', type=float, default=10.)
    parser.add_argument('--use-full-observation',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    assert args.expert_demo_path is not None, "DQfD needs collected \
                        expert demonstrations"

    import logging

    if args.logging_filename is not None:
        logging.basicConfig(filename=args.logging_filename,
                            filemode='w',
                            level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    logger = logging.getLogger(__name__)

    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    chainerrl.misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    logger.info('Output files are saved in {}'.format(args.outdir))

    if args.env == 'MineRLTreechop-v0':
        branch_sizes = [
            9, 16, args.camera_atomic_actions, args.camera_atomic_actions
        ]
    elif args.env in [
            'MineRLNavigate-v0', 'MineRLNavigateDense-v0',
            'MineRLNavigateExtreme-v0', 'MineRLNavigateExtremeDense-v0'
    ]:
        branch_sizes = [
            9, 16, args.camera_atomic_actions, args.camera_atomic_actions, 2
        ]
    elif args.env in [
            'MineRLObtainIronPickaxe-v0', 'MineRLObtainIronPickaxeDense-v0',
            'MineRLObtainDiamond-v0', 'MineRLObtainDiamondDense-v0'
    ]:
        branch_sizes = [
            9, 16, args.camera_atomic_actions, args.camera_atomic_actions, 32
        ]
    else:
        raise Exception("Unknown environment")

    def make_env(env, test):
        # wrap env: observation...
        # NOTE: wrapping order matters!
        if args.use_full_observation:
            env = FullObservationSpaceWrapper(env)
        elif args.env.startswith('MineRLNavigate'):
            env = PoVWithCompassAngleWrapper(env)
        else:
            env = ObtainPoVWrapper(env)
        if test and args.monitor:
            env = gym.wrappers.Monitor(
                env,
                os.path.join(args.outdir, 'monitor'),
                mode='evaluation' if test else 'training',
                video_callable=lambda episode_id: True)
        if args.frame_skip is not None:
            env = FrameSkip(env, skip=args.frame_skip)

        # convert hwc -> chw as Chainer requires
        env = MoveAxisWrapper(env,
                              source=-1,
                              destination=0,
                              use_tuple=args.use_full_observation)
        #env = ScaledFloatFrame(env)
        if args.frame_stack is not None:
            env = FrameStack(env,
                             args.frame_stack,
                             channel_order='chw',
                             use_tuple=args.use_full_observation)

        # wrap env: action...
        env = BranchedActionWrapper(env, branch_sizes,
                                    args.camera_atomic_actions,
                                    args.max_range_of_camera)

        if test:
            env = BranchedRandomizedAction(env, branch_sizes,
                                           args.eval_epsilon)

        env_seed = test_seed if test else train_seed
        env.seed(int(env_seed))
        return env

    core_env = gym.make(args.env)
    env = make_env(core_env, test=False)
    eval_env = make_env(core_env, test=True)

    # Q function
    if args.env.startswith('MineRLNavigate'):
        if args.use_full_observation:
            base_channels = 3  # RGB
        else:
            base_channels = 4  # RGB + compass
    elif args.env.startswith('MineRLObtain'):
        base_channels = 3  # RGB
    else:
        base_channels = 3  # RGB

    if args.frame_stack is None:
        n_input_channels = base_channels
    else:
        n_input_channels = base_channels * args.frame_stack

    q_func = CNNBranchingQFunction(branch_sizes,
                                   n_input_channels=n_input_channels,
                                   gradient_rescaling=args.gradient_rescaling,
                                   use_tuple=args.use_full_observation)

    def phi(x):
        # observation -> NN input
        if args.use_full_observation:
            pov = np.asarray(x[0], dtype=np.float32)
            others = np.asarray(x[1], dtype=np.float32)
            return (pov / 255, others)
        else:
            return np.asarray(x, dtype=np.float32) / 255

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.array([np.random.randint(n) for n in branch_sizes]))

    # Draw the computational graph and save it in the output directory.
    if args.use_full_observation:
        sample_obs = tuple([x[None] for x in env.observation_space.sample()])
    else:
        sample_obs = env.observation_space.sample()[None]

    chainerrl.misc.draw_computational_graph([q_func(phi(sample_obs))],
                                            os.path.join(args.outdir, 'model'))

    if args.optimizer == 'rmsprop':
        opt = chainer.optimizers.RMSpropGraves(args.lr,
                                               alpha=0.95,
                                               momentum=0.0,
                                               eps=1e-2)
    elif args.optimizer == 'adam':
        opt = chainer.optimizers.Adam(args.lr)

    if args.use_noisy_net is None:
        opt.setup(q_func)

    if args.gradient_rescaling:
        opt.add_hook(ScaleGradHook(1 / (1 + len(q_func.branch_sizes))))
    if args.gradient_clipping:
        opt.add_hook(chainer.optimizer_hooks.GradientClipping(10.0))

    # calculate corresponding `steps` and `eval_interval` according to frameskip
    maximum_frames = 8640000  # = 1440 episodes if we count an episode as 6000 frames.
    if args.frame_skip is None:
        steps = maximum_frames
        eval_interval = 6000 * 100  # (approx.) every 100 episode (counts "1 episode = 6000 steps")
    else:
        steps = maximum_frames // args.frame_skip
        eval_interval = 6000 * 100 // args.frame_skip  # (approx.) every 100 episode (counts "1 episode = 6000 steps")

    # Anneal beta from beta0 to 1 throughout training
    betasteps = steps / args.update_interval
    replay_buffer = PrioritizedDemoReplayBuffer(args.replay_buffer_size,
                                                alpha=0.4,
                                                beta0=0.6,
                                                betasteps=betasteps,
                                                error_max=args.error_max,
                                                num_steps=args.num_step_return)

    # Fill the demo buffer with expert transitions
    if not args.demo:
        chosen_dirs = choose_top_experts(args.expert_demo_path,
                                         args.n_experts,
                                         logger=logger)

        fill_buffer(args.env,
                    chosen_dirs,
                    replay_buffer,
                    args.frame_skip,
                    args.frame_stack,
                    args.camera_atomic_actions,
                    args.max_range_of_camera,
                    args.use_full_observation,
                    logger=logger)

        logger.info("Demo buffer loaded with {} transitions".format(
            len(replay_buffer)))

    def reward_transform(x):
        return np.sign(x) * np.log(1 + np.abs(x))

    if args.use_noisy_net is not None and args.use_noisy_net == 'before-pretraining':
        chainerrl.links.to_factorized_noisy(q_func,
                                            sigma_scale=args.noisy_net_sigma)
        explorer = explorers.Greedy()

        opt.setup(q_func)

    agent = DQfD(q_func,
                 opt,
                 replay_buffer,
                 gamma=0.99,
                 explorer=explorer,
                 n_pretrain_steps=args.n_pretrain_steps,
                 demo_supervised_margin=args.demo_supervised_margin,
                 bonus_priority_agent=args.bonus_priority_agent,
                 bonus_priority_demo=args.bonus_priority_demo,
                 loss_coeff_nstep=args.loss_coeff_nstep,
                 loss_coeff_supervised=args.loss_coeff_supervised,
                 loss_coeff_l2=args.loss_coeff_l2,
                 gpu=args.gpu,
                 replay_start_size=args.replay_start_size,
                 target_update_interval=args.target_update_interval,
                 clip_delta=args.clip_delta,
                 update_interval=args.update_interval,
                 batch_accumulator=args.batch_accumulator,
                 phi=phi,
                 reward_transform=reward_transform,
                 minibatch_size=args.minibatch_size)

    if args.use_noisy_net is not None and args.use_noisy_net == 'after-pretraining':
        chainerrl.links.to_factorized_noisy(q_func,
                                            sigma_scale=args.noisy_net_sigma)
        explorer = explorers.Greedy()

        if args.optimizer == 'rmsprop':
            opt = chainer.optimizers.RMSpropGraves(args.lr,
                                                   alpha=0.95,
                                                   momentum=0.0,
                                                   eps=1e-2)
        elif args.optimizer == 'adam':
            opt = chainer.optimizers.Adam(args.lr)
        opt.setup(q_func)
        opt.add_hook(chainer.optimizer_hooks.WeightDecay(args.loss_coeff_l2))
        agent.optimizer = opt

        agent.target_model = None
        agent.sync_target_network()

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        logger.info('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        agent.pretrain()

        evaluator = Evaluator(agent=agent,
                              n_steps=None,
                              n_episodes=args.eval_n_runs,
                              eval_interval=eval_interval,
                              outdir=args.outdir,
                              max_episode_len=None,
                              env=eval_env,
                              step_offset=0,
                              save_best_so_far_agent=True,
                              logger=logger)

        # Evaluate the agent BEFORE training begins
        evaluator.evaluate_and_update_max_score(t=0, episodes=0)

        experiments.train_agent(agent=agent,
                                env=env,
                                steps=steps,
                                outdir=args.outdir,
                                max_episode_len=None,
                                step_offset=0,
                                evaluator=evaluator,
                                successful_score=None,
                                step_hooks=[])

    env.close()
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=1000)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**8)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--episodic-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=50)
    parser.add_argument('--target-update-interval', type=int, default=100)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=1000)
    parser.add_argument('--n-hidden-channels', type=int, default=12)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    n_atoms = 51
    v_max = 500
    v_min = 0

    n_actions = action_space.n
    q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
        obs_size,
        n_actions,
        n_atoms,
        v_min,
        v_max,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(args.start_epsilon,
                                                  args.end_epsilon,
                                                  args.final_exploration_steps,
                                                  action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    if args.episodic_replay:
        if args.minibatch_size is None:
            args.minibatch_size = 4
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            betasteps = (args.steps - args.replay_start_size) \
                // args.update_interval
            rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                         betasteps=betasteps)
        else:
            rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
        episodic_update=args.episodic_replay,
        episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
Exemple #26
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.01,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.001,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=3 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--demo-n-episodes', type=int, default=30)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval',
                        type=int,
                        default=250000,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    parser.add_argument('--recurrent',
                        action='store_true',
                        default=False,
                        help='Use a recurrent model. See the code for the'
                        ' model definition.')
    parser.add_argument('--flicker',
                        action='store_true',
                        default=False,
                        help='Use so-called flickering Atari, where each'
                        ' screen is blacked out with probability 0.5.')
    parser.add_argument('--no-frame-stack',
                        action='store_true',
                        default=False,
                        help='Disable frame stacking so that the agent can'
                        ' only see the current screen.')
    parser.add_argument('--episodic-update-len',
                        type=int,
                        default=10,
                        help='Maximum length of sequences for updating'
                        ' recurrent models')
    parser.add_argument('--batch-size',
                        type=int,
                        default=32,
                        help='Number of transitions (in a non-recurrent case)'
                        ' or sequences (in a recurrent case) used for an'
                        ' update.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    print('Observation space', env.observation_space)
    print('Action space', env.action_space)

    n_actions = env.action_space.n
    if args.recurrent:
        # Q-network with LSTM
        q_func = chainerrl.links.StatelessRecurrentSequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            functools.partial(F.reshape, shape=(-1, 3136)),
            F.relu,
            L.NStepLSTM(1, 3136, 512, 0),
            L.Linear(None, n_actions),
            DiscreteActionValue,
        )
        # Replay buffer that stores whole episodes
        rbuf = replay_buffer.EpisodicReplayBuffer(10**6)
    else:
        # Q-network without LSTM
        q_func = chainer.Sequential(
            L.Convolution2D(None, 32, 8, stride=4),
            F.relu,
            L.Convolution2D(None, 64, 4, stride=2),
            F.relu,
            L.Convolution2D(None, 64, 3, stride=1),
            functools.partial(F.reshape, shape=(-1, 3136)),
            L.Linear(None, 512),
            F.relu,
            L.Linear(None, n_actions),
            DiscreteActionValue,
        )
        # Replay buffer that stores transitions separately
        rbuf = replay_buffer.ReplayBuffer(10**6)

    # Draw the computational graph and save it in the output directory.
    fake_obss = np.zeros(env.observation_space.shape, dtype=np.float32)[None]
    if args.recurrent:
        fake_out, _ = q_func(fake_obss, None)
    else:
        fake_out = q_func(fake_obss)
    chainerrl.misc.draw_computational_graph([fake_out],
                                            os.path.join(args.outdir, 'model'))

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    opt = chainer.optimizers.Adam(1e-4, eps=1e-4)
    opt.setup(q_func)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = chainerrl.agents.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
        minibatch_size=args.batch_size,
        episodic_update_len=args.episodic_update_len,
        recurrent=args.recurrent,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.demo_n_episodes,
        )
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.demo_n_episodes,
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev'],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
        )
Exemple #27
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--n-best-episodes', type=int, default=30)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=None),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = optimizers.RMSpropGraves(lr=2.5e-4,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=10**4,
                  clip_delta=True,
                  update_interval=4,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'], eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 30 evaluation episodes, each capped at 5 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=4500,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='/tmp/chainerRL_results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--evaluate',
                        action='store_true',
                        default=False,
                        help="Run evaluation mode")
    parser.add_argument('--load',
                        type=str,
                        default=None,
                        help="Load saved_model")
    parser.add_argument('--steps', type=int, default=4 * 10**6)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=5 * 10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=1)
    parser.add_argument('--eval-interval',
                        type=int,
                        default=1e4,
                        help="After how many steps to evaluate the agent."
                        "(-1 -> always)")
    parser.add_argument('--n-hidden-channels', type=int, default=20)
    parser.add_argument('--n-hidden-layers', type=int, default=20)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1)
    parser.add_argument('--time-step-limit', type=int, default=1e5)
    parser.add_argument('--outdir-time-suffix',
                        choices=['empty', 'none', 'time'],
                        default='empty',
                        type=str.lower)
    parser.add_argument('--checkpoint_frequency',
                        type=int,
                        default=1e3,
                        help="Nuber of steps to checkpoint after")
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='Use debug log-level')
    args = parser.parse_args()
    import logging
    logging.basicConfig(
        level=logging.INFO if not args.verbose else logging.DEBUG)

    # Set a random seed used in ChainerRL ALSO SETS NUMPY SEED!
    misc.set_random_seed(args.seed)

    if args.outdir and not args.load:
        outdir_suffix_dict = {
            'none': '',
            'empty': '',
            'time': '%Y%m%dT%H%M%S.%f'
        }
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=outdir_suffix_dict[args.outdir_time_suffix])
    elif args.load:
        if args.load.endswith(os.path.sep):
            args.load = args.load[:-1]
        args.outdir = os.path.dirname(args.load)
        count = 0
        fn = os.path.join(args.outdir.format(count), 'scores_{:>03d}')
        while os.path.exists(fn.format(count)):
            count += 1
        os.rename(os.path.join(args.outdir, 'scores.txt'), fn.format(count))
        if os.path.exists(os.path.join(args.outdir, 'best')):
            os.rename(os.path.join(args.outdir, 'best'),
                      os.path.join(args.outdir, 'best_{:>03d}'.format(count)))

    logging.info('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        HOST = ''  # The server's hostname or IP address
        PORT = 54321  # The port used by the server
        if test:  # Just such that eval and train env don't have the same port
            PORT += 1

        # TODO don't hardcode env params
        # TODO if we use this solution (i.e. write port to file and read it with FD) we would have to make sure that
        # outdir doesn't append time strings. Otherwise it will get hard to use on the cluster
        env = FDEnvSelHeur(host=HOST,
                           port=PORT,
                           num_heuristics=2,
                           config_dir=args.outdir)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    # state = env.reset()
    # while True:
    # for x in [1,1,1,1,0,0,0,0]:
    #    state, reward, done, _ = env.step(x)
    #    print(x)
    #    if done:
    #        break

    timestep_limit = args.time_step_limit
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):  # Usefull if we want to control
        action_size = action_space.low.size  # other continous parameters
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # q_func = FCDuelingDQN(
        #     obs_size, n_actions)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if not args.load:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam(eps=1e-2)
    logging.info('Optimizer: %s', str(opt))
    opt.setup(q_func)
    opt.add_hook(GradientClipping(5))

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
        # args.minibatch_size = 16
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
                    // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DDQN(
        q_func,
        opt,
        rbuf,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )
    t_offset = 0
    if args.load:  # Continue training model or load for evaluation
        agent.load(args.load)
        rbuf.load(os.path.join(args.load, 'replay_buffer.pkl'))
        try:
            t_offset = int(os.path.basename(args.load).split('_')[0])
        except TypeError:
            with open(os.path.join(args.load, 't.txt'), 'r') as fh:
                data = fh.readlines()
            t_offset = int(data[0])
        except ValueError:
            t_offset = 0

    eval_env = make_env(test=False)

    if args.evaluate:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        criterion = 'steps'  # can be made an argument if we support any other form of checkpointing
        l = logging.getLogger('Checkpoint_Hook')

        def checkpoint(env, agent, step):
            if criterion == 'steps':
                if step % args.checkpoint_frequency == 0:
                    save_agent_and_replay_buffer(
                        agent,
                        step,
                        args.outdir,
                        suffix='_chkpt',
                        logger=l,
                        chckptfrq=args.checkpoint_frequency)
            else:
                # TODO seems to checkpoint given wall_time we would have to modify the environment such that it tracks
                # time or number of episodes
                raise NotImplementedError

        hooks = [checkpoint]
        experiments.train_agent(agent=agent,
                                env=env,
                                steps=args.steps,
                                outdir=args.outdir,
                                step_hooks=hooks,
                                step_offset=t_offset)
Exemple #29
0
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=1000)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=10**8)
    parser.add_argument('--replay-start-size', type=int, default=50)
    parser.add_argument('--target-update-interval', type=int, default=100)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=1000)
    parser.add_argument('--n-hidden-channels', type=int, default=12)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--minibatch-size', type=int, default=32)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    hidden_size = 64
    q_func = chainerrl.agents.iqn.ImplicitQuantileQFunction(
        psi=chainerrl.links.Sequence(
            L.Linear(obs_size, hidden_size),
            F.relu,
        ),
        phi=chainerrl.links.Sequence(
            chainerrl.agents.iqn.CosineBasisLinear(64, hidden_size),
            F.relu,
        ),
        f=L.Linear(hidden_size, env.action_space.n),
    )
    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(args.start_epsilon,
                                                  args.end_epsilon,
                                                  args.final_exploration_steps,
                                                  action_space.sample)

    opt = optimizers.Adam(1e-3)
    opt.setup(q_func)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = chainerrl.agents.IQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            max_episode_len=timestep_limit)
Exemple #30
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False,
                        help='Evaluate the agent without training.')
    parser.add_argument('--load', type=str, default=None,
                        help='Load a saved agent from a given directory.')
    parser.add_argument('--final-exploration-steps',
                        type=int, default=5 * 10 ** 5,
                        help='Timesteps after which we stop'
                             ' annealing exploration rate')
    parser.add_argument('--final-epsilon', type=float, default=0.2,
                        help='Final value of epsilon during training.')
    parser.add_argument('--steps', type=int, default=2 * 10 ** 6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10 ** 4,
                        help='Minimum replay buffer size before'
                             ' performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int, default=1 * 10 ** 4,
                        help='Frequency (in timesteps) at which'
                             ' the target network is updated.')
    parser.add_argument('--eval-interval', type=int, default=10 ** 5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval', type=int, default=1,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=100,
                        help='Number of episodes used for evaluation.')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--lr', type=float, default=6.25e-5,
                        help='Learning rate')
    parser.add_argument('--num-envs', type=int, default=1,
                        help='Number of envs run in parallel.')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Batch size used for training.')
    parser.add_argument('--record', action='store_true', default=False,
                        help='Record videos of evaluation envs.'
                             ' --render should also be specified.')
    parser.add_argument('--gamma', type=float, default=0.99,
                        help='Discount factor.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    max_episode_steps = 8

    def make_env(idx, test):
        from pybullet_envs.bullet.kuka_diverse_object_gym_env import KukaDiverseObjectEnv  # NOQA
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        # Set a random seed for this subprocess
        misc.set_random_seed(env_seed)
        env = KukaDiverseObjectEnv(
            isDiscrete=True,
            renders=args.render and (args.demo or not test),
            height=84,
            width=84,
            maxSteps=max_episode_steps,
            isTest=test,
        )
        # (84, 84, 3) -> (3, 84, 84)
        env = TransposeObservation(env, (2, 0, 1))
        env = ObserveElapsedSteps(env, max_episode_steps)
        # KukaDiverseObjectEnv internally asserts int actions and does not
        # accept python-future's newint.
        env = CastAction(env, __builtins__.int)
        env.seed(int(env_seed))
        if test and args.record:
            assert args.render,\
                'To use --record, --render needs be specified.'
            video_dir = os.path.join(args.outdir, 'video_{}'.format(idx))
            os.mkdir(video_dir)
            env = RecordMovie(env, video_dir)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test)
                for idx in range(args.num_envs)])

    eval_env = make_batch_env(test=True)
    n_actions = eval_env.action_space.n

    q_func = GraspingQFunction(n_actions, max_episode_steps)

    # Draw the computational graph and save it in the output directory.
    fake_obs = (
        np.zeros((3, 84, 84), dtype=np.float32)[None],
        np.zeros((), dtype=np.int32)[None],
    )
    chainerrl.misc.draw_computational_graph(
        [q_func(fake_obs)],
        os.path.join(args.outdir, 'model'))

    # Use the hyper parameters of the Nature paper
    opt = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)

    opt.setup(q_func)

    # Anneal beta from beta0 to 1 throughout training
    betasteps = args.steps / args.update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        10 ** 6, alpha=0.6, beta0=0.4, betasteps=betasteps)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_steps,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor
        image, elapsed_steps = x
        # Normalize RGB values: [0, 255] -> [0, 1]
        norm_image = np.asarray(image, dtype=np.float32) / 255
        return norm_image, elapsed_steps

    agent = chainerrl.agents.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        minibatch_size=args.batch_size,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=eval_env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )