コード例 #1
0
    def test_sequential(self):
        """Verify to_factorized_noisy on a ``chainer.Sequential`` model.

        Before conversion only the two Linear links are reported; afterwards
        each of them additionally exposes ``mu`` and ``sigma`` children. The
        plain function layer in the middle must stay the very same object,
        and every parameter must receive a gradient after a backward pass.
        """
        relu = chainer.functions.relu

        def ones_batch():
            # A fresh (2, 3) float32 batch of ones for each forward pass.
            return numpy.ones((2, 3), numpy.float32)

        model = chainer.Sequential(
            chainer.links.Linear(3),
            relu,
            chainer.links.Linear(4),
        )
        plain_names = {'/0', '/1'}
        self.assertEqual(names_of_links(model), plain_names)
        self.assertIs(model._layers[1], relu)

        to_factorized_noisy(model)

        noisy_names = plain_names | {
            prefix + child
            for prefix in plain_names
            for child in ('/mu', '/sigma')
        }
        self.assertEqual(names_of_links(model), noisy_names)
        self.assertIs(model._layers[1], relu)
        model(ones_batch())

        # assert new parameters are used
        output = model(ones_batch())
        chainer.functions.sum(output).backward()
        for param in model.params():
            self.assertIsNotNone(param.grad)
コード例 #2
0
    def _test_load_rainbow(self, gpu):
        """Build a CategoricalDoubleDQN agent and load a downloaded model.

        Args:
            gpu: Device id forwarded to the agent constructor.

        When the ``CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED`` environment
        variable is set to a non-empty value, the ``exists`` flag returned
        by ``download_model`` is additionally asserted to be truthy.
        """
        def identity(x):
            # Observations are fed to the network unchanged.
            return x

        q_func = DistributionalDuelingDQN(4, 51, -10, 10)
        links.to_factorized_noisy(q_func, sigma_scale=0.5)

        opt = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10**-4)
        opt.setup(q_func)

        agent_options = dict(
            gpu=gpu,
            gamma=0.99,
            explorer=explorers.Greedy(),
            minibatch_size=32,
            replay_start_size=50,
            target_update_interval=32000,
            update_interval=4,
            batch_accumulator='mean',
            phi=identity,
        )
        agent = agents.CategoricalDoubleDQN(
            q_func, opt, replay_buffer.ReplayBuffer(100), **agent_options)

        model, exists = download_model(
            "Rainbow",
            "BreakoutNoFrameskip-v4",
            model_type=self.pretrained_type,
        )
        agent.load(model)

        must_be_cached = os.environ.get(
            'CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED')
        if must_be_cached:
            assert exists
    def dqn_q_values_and_neuronal_net(self, args, action_space, obs_size,
                                      obs_space):
        """Assemble the Q-function, optimizer, replay buffer and explorer.

        Args:
            args: Options object. Reads the hidden-layer sizes, the epsilon
                schedule, ``noisy_net_sigma``, ``outdir``, ``minibatch_size``,
                ``prioritized_replay``, ``steps``, ``replay_start_size`` and
                ``update_interval``. ``minibatch_size`` is set to 32 in place
                when it is ``None``.
            action_space: Action space. A ``spaces.Box`` selects the
                continuous branch; anything else must expose ``n`` and
                ``sample``.
            obs_size: Observation size handed to the Q-function.
            obs_space: Observation space; ``obs_space.low`` shapes the dummy
                input used to draw the computational graph.

        Returns:
            The tuple ``(q_func, opt, rbuf, explorer)``.
        """
        n_channels = args.n_hidden_channels
        n_layers = args.n_hidden_layers

        if isinstance(action_space, spaces.Box):
            # Continuous actions: NAF-style quadratic Q-function explored
            # with an Ornstein-Uhlenbeck process.
            n_action_dims = action_space.low.size
            q_func = q_functions.FCQuadraticStateQFunction(
                obs_size,
                n_action_dims,
                n_hidden_channels=n_channels,
                n_hidden_layers=n_layers,
                action_space=action_space)
            noise_scale = (action_space.high - action_space.low) * 0.2
            explorer = explorers.AdditiveOU(sigma=noise_scale)
        else:
            # Discrete actions: fully connected Q-function explored with a
            # linearly decaying epsilon-greedy policy.
            q_func = q_functions.FCStateQFunctionWithDiscreteAction(
                obs_size,
                action_space.n,
                n_hidden_channels=n_channels,
                n_hidden_layers=n_layers)
            explorer = explorers.LinearDecayEpsilonGreedy(
                args.start_epsilon,
                args.end_epsilon,
                args.final_exploration_steps,
                action_space.sample)

        if args.noisy_net_sigma is not None:
            # With noisy layers the explorer is replaced by a greedy one.
            links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
            explorer = explorers.Greedy()

        # Save a drawing of the network evaluated on an all-zero observation.
        dummy_obs = np.zeros_like(obs_space.low, dtype=np.float32)[None]
        chainerrl.misc.draw_computational_graph(
            [q_func(dummy_obs)],
            os.path.join(args.outdir, 'model'))

        opt = optimizers.Adam()
        opt.setup(q_func)

        capacity = 5 * 10**5
        if args.minibatch_size is None:
            args.minibatch_size = 32
        if args.prioritized_replay:
            n_updates = ((args.steps - args.replay_start_size)
                         // args.update_interval)
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                capacity, betasteps=n_updates)
        else:
            rbuf = replay_buffer.ReplayBuffer(capacity)

        return q_func, opt, rbuf, explorer
コード例 #4
0
    def test_chain(self):
        """Verify to_factorized_noisy on a ``chainer.Chain``.

        The two Linear children gain ``mu`` and ``sigma`` sub-links, while
        the PReLU child is reported unchanged.
        """
        chain = chainer.Chain()
        with chain.init_scope():
            chain.l1 = chainer.links.Linear(3, 4)
            chain.l2 = chainer.links.Linear(5)
            chain.l3 = chainer.links.PReLU()
        plain_names = {'/l1', '/l2', '/l3'}
        self.assertEqual(names_of_links(chain), plain_names)

        to_factorized_noisy(chain)

        noisy_names = plain_names | {
            prefix + child
            for prefix in ('/l1', '/l2')
            for child in ('/mu', '/sigma')
        }
        self.assertEqual(names_of_links(chain), noisy_names)
コード例 #5
0
    def test_chainlist(self):
        """Verify to_factorized_noisy on a ``chainer.ChainList``.

        The two Linear entries gain ``mu`` and ``sigma`` sub-links, while
        the PReLU entry is reported unchanged.
        """
        children = [
            chainer.links.Linear(3, 4),
            chainer.links.Linear(5),
            chainer.links.PReLU(),
        ]
        chain_list = chainer.ChainList(*children)
        plain_names = {'/0', '/1', '/2'}
        self.assertEqual(names_of_links(chain_list), plain_names)

        to_factorized_noisy(chain_list)

        noisy_names = plain_names | {
            prefix + child
            for prefix in ('/0', '/1')
            for child in ('/mu', '/sigma')
        }
        self.assertEqual(names_of_links(chain_list), noisy_names)
コード例 #6
0
def main(args):
    """Train a DoubleDQN agent on a gym environment, or record a rollout.

    Args:
        args: Parsed options object, or a list of raw option tokens which
            is first converted with ``make_args``.

    Returns:
        The matplotlib ``FuncAnimation`` when ``args.mode == 'check'``;
        ``None`` for every other mode.
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    # isinstance (instead of an exact type comparison) also accepts
    # subclasses of list.
    if isinstance(args, list):
        args = make_args(args)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        # action_space is looked up in the enclosing scope at call time; it
        # is assigned below, after the training env has been built.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)

        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        print("Use NAF to apply DQN to continuous action spaces")
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        print("not continuous action spaces")
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DoubleDQN(
        q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
        explorer=explorer, replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=timestep_limit,
            log_type=args.log_type
        )
    elif args.mode == 'check':
        # matplotlib is only needed for this mode, so it is imported lazily.
        from matplotlib import animation
        import matplotlib.pyplot as plt

        # NOTE(review): the rollout below uses the training env (which has
        # the reward-scaling wrapper), not eval_env -- confirm this is
        # intended.
        frames = []
        for i in range(3):
            obs = env.reset()
            done = False
            R = 0
            t = 0
            # Each episode is capped at 200 steps.
            while not done and t < 200:
                frames.append(env.render(mode='rgb_array'))
                action = agent.act(obs)
                obs, r, done, _ = env.step(action)
                R += r
                t += 1
            print('test episode:', i, 'R:', R)
            agent.stop_episode()
        env.close()

        # Size the figure so that one frame pixel maps to one point at 72 dpi.
        plt.figure(
            figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
            dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(
            plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(args.save_mp4)
        return anim
コード例 #7
0
def main():
    """Train or demo a CategoricalDoubleDQN agent on an Atari environment.

    Command-line options are parsed with argparse. The Q-function is a
    ``DistributionalDuelingDQN`` converted to factorized noisy layers, the
    explorer is greedy, and transitions are stored in a prioritized replay
    buffer with 3-step returns.

    With ``--demo`` the agent is only evaluated and the statistics are
    printed. Otherwise it is trained with periodic evaluation, the best
    saved network is reloaded and evaluated for ``--n-best-episodes``
    episodes, and the resulting statistics are written to
    ``bestscores.json`` in the output directory and printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--eval-epsilon', type=float, default=0.0)
    parser.add_argument('--noisy-net-sigma', type=float, default=0.5)
    parser.add_argument('--steps', type=int, default=5 * 10 ** 7)
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes with 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=2 * 10 ** 4)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--n-best-episodes', type=int, default=200)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max,)

    # Noisy nets
    links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10 ** -4)
    opt.setup(q_func)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        10 ** 6, alpha=0.5, beta0=0.4, betasteps=betasteps,
        num_steps=3,
        normalize_by_max='memory',
    )

    def phi(x):
        # Feature extractor: cast to float32 and divide by 255.
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.CategoricalDoubleDQN
    agent = Agent(
        q_func, opt, rbuf, gpu=args.gpu, gamma=0.99,
        explorer=explorer, minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=32000,
        update_interval=update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=args.eval_n_steps,
            n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'],
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev']))

    else:
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # Run args.n_best_episodes evaluation episodes. The per-episode cap
        # is max_frames divided by 4; integer division keeps the cap a whole
        # number of steps even when max_frames is not a multiple of 4.
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames // 4,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            # temporary hack to handle python 2/3 support issues.
            # json dumps does not support non-string literal dict keys
            json_stats = json.dumps(stats)
            print(str(json_stats), file=f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
コード例 #8
0
def main():
    """Train or demo a DQN-family agent on a MarLo environment.

    Command-line options are parsed with argparse. The Q-function is built
    by ``parse_arch`` and the agent class is selected by ``parse_agent``.
    When ``--noisy-net-sigma`` is given, the Q-function is converted to
    factorized noisy layers with that sigma scale and a greedy explorer is
    used; otherwise a linearly decaying epsilon-greedy explorer is used.

    With ``--demo`` the agent is only evaluated and the statistics are
    printed; otherwise it is trained with periodic evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='MarLo-FindTheGoal-v0',
                        help='Marlo env to perform algorithm on.')
    parser.add_argument('--out_dir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.01,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.001,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='nature',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=3 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--agent',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    parser.add_argument('--prioritized',
                        action='store_true',
                        default=False,
                        help='Use prioritized experience replay.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    print('Output files are saved in {}'.format(args.out_dir))

    # A single env, seeded with args.seed, serves both training and eval.
    env = make_env(args.env, env_seed=args.seed, demo=args.demo)

    n_actions = env.action_space.n

    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer: with noisy layers the agent acts greedily.
        explorer = explorers.Greedy()
    else:
        # Only fall back to epsilon-greedy when noisy nets are disabled, so
        # the greedy explorer chosen above is not overwritten.
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon, args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Use the Nature paper's hyperparameters
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(10**6,
                                                     alpha=0.6,
                                                     beta0=0.4,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        # Feature extractor: move the last axis to the front, then cast to
        # float32 and divide by 255.
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
コード例 #9
0
def main():
    """Train or demo a categorical DQN agent on an Atari environment.

    Command-line options are parsed with argparse. The Q-function is built
    by ``parse_arch`` and the agent class is selected by ``parse_agent``.
    When ``--noisy-net-sigma`` is given, the Q-function is converted to
    factorized noisy layers with that sigma scale and a greedy explorer is
    used; otherwise a linearly decaying epsilon-greedy explorer is used.

    With ``--demo`` the agent is only evaluated and the statistics are
    printed; otherwise it is trained with periodic evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true', default=False)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.1)
    parser.add_argument('--eval-epsilon', type=float, default=0.05)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='plain',
                        choices=['plain', 'dueling'],
                        help='Network architecture to use.')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    # argparse applies ``type`` only to string defaults, so the default is
    # written as an int expression to keep the value an int (32000).
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=32 * 10**3)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--num-step-return', type=int, default=1)
    parser.add_argument('--agent',
                        type=str,
                        default='CDQN',
                        choices=['CDQN', 'DoubleCDQN'])
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--prioritized',
                        action='store_true',
                        default=False,
                        help='Use prioritized experience replay.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = parse_arch(args.arch, n_actions, n_atoms, v_min, v_max)

    if args.noisy_net_sigma is not None:
        # Forward the requested sigma instead of silently using the
        # library default.
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon, args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10**-4)
    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6,
            alpha=0.5,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=args.num_step_return)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6, args.num_step_return)

    def phi(x):
        # Feature extractor: cast to float32 and divide by 255.
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=args.batch_size,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
コード例 #10
0
ファイル: Train_DQN.py プロジェクト: sjxh92/DeepNSM
    def main(self):
        """Build a DQN agent and either evaluate it or train it.

        Options are read from the free name ``args`` (not a parameter of
        this method). Environments come from ``self.env_make`` and the
        replay buffer from ``self.buffer``. When ``args.demo`` is set, the
        agent is evaluated and the statistics are printed; otherwise it is
        trained with periodic evaluation.
        """
        import logging
        logging.basicConfig(level=logging.INFO)

        # Seed ChainerRL's random number generators.
        misc.set_random_seed(args.seed, gpus=(args.gpu, ))

        args.outdir = experiments.prepare_output_dir(
            args, args.outdir, argv=sys.argv)
        print('Output files are saved in {}'.format(args.outdir))

        train_env = self.env_make(test=False)
        max_episode_steps = train_env.total_time
        n_obs = train_env.observation.size
        act_space = train_env.action_space

        # Fully connected Q-function over a discrete action set.
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            n_obs,
            act_space.n,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)

        # Epsilon-greedy exploration by default ...
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            act_space.sample)
        # ... replaced by a greedy explorer when noisy layers are requested.
        if args.noisy_net_sigma is not None:
            links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
            explorer = explorers.Greedy()

        opt = optimizers.Adam()
        opt.setup(q_func)

        rbuf = self.buffer()

        agent_options = dict(
            gamma=args.gamma,
            explorer=explorer,
            replay_start_size=args.replay_start_size,
            target_update_interval=args.target_update_interval,
            update_interval=args.update_interval,
            minibatch_size=args.minibatch_size,
            target_update_method=args.target_update_method,
            soft_update_tau=args.soft_update_tau,
        )
        agent = DQN(q_func, opt, rbuf, **agent_options)
        if args.load:
            agent.load(args.load)

        eval_env = self.env_make(test=True)

        if not args.demo:
            experiments.train_agent_with_evaluation(
                agent=agent,
                env=train_env,
                steps=args.steps,
                eval_n_steps=None,
                eval_n_episodes=args.eval_n_runs,
                eval_interval=args.eval_interval,
                outdir=args.outdir,
                eval_env=eval_env,
                train_max_episode_len=max_episode_steps)
            return

        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=max_episode_steps)
        summary = 'n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs,
            eval_stats['mean'],
            eval_stats['median'],
            eval_stats['stdev'])
        print(summary)
コード例 #11
0
ファイル: dqn.py プロジェクト: yue123161/DQEAF
def main():
    """Train or evaluate a Double DQN agent on the malware gym environments.

    Options are read from the command line with ``argparse``. With ``--demo``
    the agent is only evaluated on the test environment and summary
    statistics are printed; otherwise it is trained with periodic evaluation,
    using ``--outdir`` as the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--seed', type=int, default=123,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10 ** 4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--steps', type=int, default=50000)
    parser.add_argument('--prioritized-replay', action='store_true', default=False)
    parser.add_argument('--episodic-replay', action='store_true', default=False)
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10 ** 2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=50)
    parser.add_argument('--eval-interval', type=int, default=10 ** 3)
    parser.add_argument('--n-hidden-channels', type=int, default=512)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    # NOTE(review): store_true combined with default=True means this flag is
    # always True and cannot be disabled from the command line.
    parser.add_argument('--monitor', action='store_true', default=True)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-3)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        """Create the training (``test=False``) or test environment."""
        ENV_NAME = 'malware-test-v0' if test else 'malware-v0'
        env = gym.make(ENV_NAME)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            misc.env_modifiers.make_rendered(env)
        return env

    env = make_env(test=False)
    # Maximum episode length, used for both training and evaluation.
    timestep_limit = 80
    obs_space = env.observation_space
    obs_size = obs_space.shape[0]
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size, n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    if args.gpu >= 0:
        q_func.to_gpu(args.gpu)

    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
        action_space.sample)

    if args.noisy_net_sigma is not None:
        # NOTE(review): the value of --noisy-net-sigma is only used as an
        # on/off switch here; it is not forwarded to to_factorized_noisy.
        links.to_factorized_noisy(q_func)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if args.gpu < 0:
        # Build the dummy observation from the space's bounds array; calling
        # zeros_like on the Space object itself would not yield an
        # (1, obs_size) batch.
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    # Default minibatch size depends on the replay mode.
    if args.minibatch_size is None:
        args.minibatch_size = 4 if args.episodic_replay else 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        if args.episodic_replay:
            rbuf = replay_buffer.PrioritizedEpisodicReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
        else:
            rbuf = replay_buffer.PrioritizedReplayBuffer(
                rbuf_capacity, betasteps=betasteps)
    elif args.episodic_replay:
        rbuf = replay_buffer.EpisodicReplayBuffer(rbuf_capacity)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    def phi(obs):
        """Cast an observation to float32 before it is fed to the model."""
        return obs.astype(np.float32)

    agent = DoubleDQN(q_func, opt, rbuf, gamma=args.gamma,
                      explorer=explorer, replay_start_size=args.replay_start_size,
                      target_update_interval=args.target_update_interval,
                      update_interval=args.update_interval,
                      phi=phi, minibatch_size=args.minibatch_size,
                      target_update_method=args.target_update_method,
                      soft_update_tau=args.soft_update_tau,
                      episodic_update=args.episodic_replay, episodic_update_len=16)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        q_hook = PlotHook('Average Q Value')
        loss_hook = PlotHook('Average Loss', plot_index=1)

        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            max_episode_len=timestep_limit,
            step_hooks=[q_hook, loss_hook],
            successful_score=7
        )
コード例 #12
0
def main(args):
    """Train, check or visualise a DQN-family agent on an Atari environment.

    Args:
        args: Either an already-parsed options object or a list, in which
            case it is converted with ``make_args`` first.

    Returns:
        The result of ``tools.make_video.check`` / ``tools.make_video.growth``
        when ``args.mode`` is ``'check'`` / ``'growth'``; ``None`` otherwise.
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.outdir, exist_ok=True)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    def make_env(test):
        """Create the wrapped Atari env for training or evaluation."""
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon, args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the Nature paper's hyperparameters
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6,
            alpha=0.6,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=args.num_step_return)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6, args.num_step_return)

    def phi(x):
        # Feature extractor: cast to float32 and divide by 255.
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load_agent:
        agent.load(args.load_agent)

    # Any mode other than the three handled below falls through and
    # returns None.
    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            checkpoint_freq=args.checkpoint_frequency,
            step_offset=args.step_offset,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_type=args.log_type)
    elif args.mode == 'check':
        return tools.make_video.check(env=env,
                                      agent=agent,
                                      save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(env=env,
                                       agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
コード例 #13
0
    def __init__(self, alg, env, model_path):
        """Build the model and agent for ``alg`` and load its saved weights.

        Args:
            alg: Algorithm name; one of ``"DQN-C"``, ``"PPO"``, ``"C51"``,
                ``"ACER"``, ``"A3C"``, ``"Rainbow"`` or ``"IQN"``.
            env: Gym environment id passed to ``gym.make``. The suffix
                ``"NoFrameskip-v4"`` is stripped when building the weights
                path.
            model_path: Root directory; weights are loaded from
                ``<model_path>/chainer/<alg>/<env without suffix>/final``.

        Raises:
            ValueError: If ``alg`` is not one of the supported names.
        """
        supported_algs = ("DQN-C", "PPO", "C51", "ACER", "A3C", "Rainbow", "IQN")
        if alg not in supported_algs:
            # Fail early with a clear message instead of an unbound `model`
            # further down.
            raise ValueError(
                "Unsupported algorithm {!r}; expected one of {}".format(
                    alg, ", ".join(supported_algs)))

        self.alg = alg
        seed = 0
        n_actions = gym.make(env).action_space.n
        gpus = [-1]
        gpu = None
        misc.set_random_seed(seed, gpus=gpus)

        # --- Model construction -------------------------------------------
        if alg == "DQN-C":
            model = links.Sequence(
                links.NatureDQNHead(),
                L.Linear(512, n_actions),
                DiscreteActionValue)
        elif alg == "PPO":
            winit_last = chainer.initializers.LeCunNormal(1e-2)
            model = chainer.Sequential(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                L.Linear(None, 512),
                F.relu,
                links.Branched(
                    chainer.Sequential(
                        L.Linear(None, n_actions, initialW=winit_last),
                        SoftmaxDistribution,
                    ),
                    L.Linear(None, 1),
                )
            )
        elif alg == "C51":
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = links.Sequence(
                links.NatureDQNHead(),
                DistributionalFCStateQFunctionWithDiscreteAction(
                    None, n_actions, n_atoms, v_min, v_max,
                    n_hidden_channels=0, n_hidden_layers=0),
            )
        elif alg == "ACER":
            model = agents.acer.ACERSharedModel(
                shared=links.Sequence(
                    links.NIPSDQNHead(),
                    L.LSTM(256, 256)),
                pi=links.Sequence(
                    L.Linear(256, n_actions),
                    SoftmaxDistribution),
                q=links.Sequence(
                    L.Linear(256, n_actions),
                    DiscreteActionValue),
            )
        elif alg == "A3C":
            model = A3CFF(n_actions)
        elif alg == "Rainbow":
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)
            links.to_factorized_noisy(model, sigma_scale=0.5)
        else:  # alg == "IQN"
            model = agents.iqn.ImplicitQuantileQFunction(
                psi=chainerrl.links.Sequence(
                    L.Convolution2D(None, 32, 8, stride=4),
                    F.relu,
                    L.Convolution2D(None, 64, 4, stride=2),
                    F.relu,
                    L.Convolution2D(None, 64, 3, stride=1),
                    F.relu,
                    functools.partial(F.reshape, shape=(-1, 3136)),
                ),
                phi=chainerrl.links.Sequence(
                    chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
                    F.relu,
                ),
                f=chainerrl.links.Sequence(
                    L.Linear(None, 512),
                    F.relu,
                    L.Linear(None, n_actions),
                ),
            )

        # --- Dummy forward pass -------------------------------------------
        # Run the model once on an all-zero (1, 4, 84, 84) observation
        # (presumably so lazily-shaped layers are initialised before the
        # optimizer is set up and weights are loaded -- TODO confirm).
        if alg == "A3C":
            fake_obs = chainer.Variable(
                np.zeros((4, 84, 84), dtype=np.float32)[None],
                name='observation')
            with chainerrl.recurrent.state_reset(model):
                # The state of the model is reset again after drawing the graph
                variables = misc.collect_variables([model(fake_obs)])
                chainer.computational_graph.build_computational_graph(variables)
        elif alg in ("Rainbow", "DQN-C", "C51", "ACER", "PPO"):
            variables = misc.collect_variables(
                [model(np.zeros((4, 84, 84), dtype=np.float32)[None])])
            chainer.computational_graph.build_computational_graph(variables)
        else:  # alg == "IQN": the model output is itself called with taus
            fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
            fake_taus = np.zeros(32, dtype=np.float32)[None]
            variables = misc.collect_variables([model(fake_obs)(fake_taus)])

        def phi(x):
            # Feature extractor: cast to float32 and divide by 255.
            return np.asarray(x, dtype=np.float32) / 255

        opt = optimizers.RMSpropGraves()
        opt.setup(model)
        rbuf = replay_buffer.ReplayBuffer(1)

        # --- Agent construction -------------------------------------------
        if alg == "IQN":
            self.agent = agents.IQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99,
                act_deterministically=True, explorer=None,
                replay_start_size=1, minibatch_size=1,
                target_update_interval=None, clip_delta=True,
                update_interval=4, phi=phi)
        elif alg == "A3C":
            self.agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, phi=phi,
                                 act_deterministically=True)
        elif alg == "Rainbow":
            self.agent = agents.CategoricalDoubleDQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
                replay_start_size=1, minibatch_size=1,
                target_update_interval=None,
                clip_delta=True, update_interval=4, phi=phi)
        elif alg == "DQN-C":
            self.agent = agents.DQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
                replay_start_size=1, minibatch_size=1,
                target_update_interval=None, clip_delta=True,
                update_interval=4, phi=phi)
        elif alg == "C51":
            self.agent = agents.CategoricalDQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99,
                explorer=None, replay_start_size=1,
                minibatch_size=1,
                target_update_interval=None,
                clip_delta=True,
                update_interval=4,
                phi=phi,
            )
        elif alg == "ACER":
            self.agent = agents.acer.ACER(model, opt, t_max=5, gamma=0.99,
                                          replay_buffer=rbuf,
                                          n_times_replay=4,
                                          replay_start_size=1,
                                          act_deterministically=True,
                                          phi=phi
                                          )
        else:  # alg == "PPO"
            self.agent = agents.PPO(
                model, opt, gpu=gpu, phi=phi, update_interval=4,
                minibatch_size=1, clip_eps=0.1,
                recurrent=False, act_deterministically=True)

        self.agent.load(os.path.join(model_path, 'chainer', alg, env.replace("NoFrameskip-v4", ""), 'final'))
コード例 #14
0
def main(args):
    """Train, check or visualise a DQN (or NAF) agent on a gym environment.

    For ``spaces.Box`` action spaces a quadratic (NAF) Q-function with
    Ornstein-Uhlenbeck exploration is used; otherwise a discrete-action
    Q-function with linearly decaying epsilon-greedy exploration.

    Args:
        args: Either an already-parsed options object or a list, in which
            case it is converted with ``make_args`` first.

    Returns:
        The result of ``tools.make_video.check`` / ``tools.make_video.growth``
        when ``args.mode`` is ``'check'`` / ``'growth'``; ``None`` otherwise.
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.outdir, exist_ok=True)

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    print('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        # `action_space` is bound later in main(); it is resolved when the
        # filter is actually called.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        """Create the wrapped gym env for training or evaluation."""
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or
                (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        print("Use NAF to apply DQN to continuous action spaces")
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size, action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        print("not continuous action spaces")
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam()
    opt.setup(q_func)

    rbuf_capacity = 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps - args.replay_start_size) \
            // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            rbuf_capacity, betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DQN(q_func, opt, rbuf, gpu=args.gpu, gamma=args.gamma,
                explorer=explorer, replay_start_size=args.replay_start_size,
                target_update_interval=args.target_update_interval,
                update_interval=args.update_interval,
                minibatch_size=args.minibatch_size,
                target_update_method=args.target_update_method,
                soft_update_tau=args.soft_update_tau,
                )

    if args.load_agent:
        agent.load(args.load_agent)

    eval_env = make_env(test=True)

    # Any mode other than the three handled below falls through and
    # returns None.
    if args.mode == 'train':
        experiments.train_agent_with_evaluation(
            agent=agent, env=env, steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir, eval_env=eval_env,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_freq,
            train_max_episode_len=args.max_episode_len,
            log_type=args.log_type
        )
    elif args.mode == 'check':
        return tools.make_video.check(
            env=env, agent=agent, save_mp4=args.save_mp4)
    elif args.mode == 'growth':
        return tools.make_video.growth(
            env=env, agent=agent, outdir=args.outdir,
            max_num=args.max_episode_len, save_mp4=args.save_mp4)
コード例 #15
0
    def __init__(self, config: Config):
        """Set up environments, the Rainbow agent and bookkeeping counters.

        Args:
            config: Run configuration. This constructor reads its ``name``,
                ``hyperparameters``, ``log_interval``, ``mode``, ``phi``,
                ``rainbow_gpu``, ``env_steps_to_run`` and the two
                environment factory callables.
        """
        print('start to init rainbow')
        self.config = config
        self.name = config.name
        hp = config.hyperparameters
        self.hyperparameters = hp

        # The logging interval grows with the number of parallel envs when
        # parallel envs are enabled.
        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval * (
                1 + hp['parallel_env_num'] * int(hp['use_parallel_envs'])
            ),
        )

        if not hp['use_parallel_envs']:
            self.env = config.environment_make_function()
        else:
            env_factories = [
                config.environment_make_function
                for _ in range(hp['parallel_env_num'])
            ]
            self.env = SubprocVecEnv_tf2(env_factories, state_flatter=None)

        self.test_env = config.test_environment_make_function()

        print(f"rainbow mode : {self.config.mode}")

        # Distributional dueling Q-function: 51 atoms on the range [-10, 10].
        n_actions = self.test_env.action_space.n
        n_atoms, v_min, v_max = 51, -10, 10
        # Input shape is taken from a preprocessed reset observation.
        obs_shape = config.phi(self.test_env.reset()).shape
        q_func = DistributionalDuelingDQN_VectorPicture(
            obs_shape, n_actions, n_atoms, v_min, v_max)

        # Noisy nets take over exploration, so the explorer is plain greedy.
        links.to_factorized_noisy(q_func, sigma_scale=hp['noisy_net_sigma'])
        explorer = explorers.Greedy()

        # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
        opt = chainer.optimizers.Adam(hp['lr'], eps=1.5 * 10**-4)
        opt.setup(q_func)

        # Prioritized replay; beta is annealed from beta0 to 1 over training.
        update_interval = 4
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6,
            alpha=0.5,
            beta0=0.4,
            betasteps=self.config.env_steps_to_run / update_interval,
            num_steps=3,
            normalize_by_max='memory',
        )

        agent_kwargs = dict(
            gpu=self.config.rainbow_gpu,
            gamma=0.99,
            explorer=explorer,
            minibatch_size=32,
            replay_start_size=hp['replay_start_size'],
            target_update_interval=16000,
            update_interval=update_interval,
            batch_accumulator='mean',
            phi=config.phi,
        )
        self.agent = agents.CategoricalDoubleDQN(
            q_func, opt, rbuf, **agent_kwargs)

        # Progress counters, all starting from zero.
        self.episode_number = self.global_step_number = 0
        self.batch_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self.flush_stats()

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
コード例 #16
0
# Separate environment instance used for evaluation.
eval_env = make_env(0, test=True)

action_size = env.action_space.n

# Distributional Q-function: 51 atoms on the range [-10, 10].
n_atoms = 51
v_max = 10
v_min = -10

q_func = DistributionalDuelingDQN(action_size, n_atoms, v_min, v_max)

gpu_device = GPU_DEVICE
# Any non-negative id selects a GPU; previously only device 0 was honoured
# and other ids were silently ignored.
if gpu_device is not None and gpu_device >= 0:
    chainer.cuda.get_device(gpu_device).use()
    q_func.to_gpu(gpu_device)

# Noisy nets take over exploration, so the explorer is plain greedy.
links.to_factorized_noisy(q_func, sigma_scale=0.5)

explorer = explorers.Greedy()

opt = chainer.optimizers.Adam(6.25e-5, eps=1.5 * 10**-4)
opt.setup(q_func)

update_interval = 4

# Number of steps over which the prioritized-replay beta is annealed.
betasteps = STEPS / update_interval

rbuf = replay_buffer.PrioritizedReplayBuffer(10**6,
                                             alpha=0.5,
                                             beta0=0.4,
                                             betasteps=betasteps,
                                             num_steps=3)
コード例 #17
0
def main():
    """Train or evaluate a DDQN agent on the ``FDEnvSelHeur`` environment.

    The function parses the command line, prepares the output directory (or,
    when resuming with ``--load``, backs up the previous ``scores.txt`` and
    ``best`` entries), builds the environment, Q-function, explorer,
    optimizer and replay buffer, and finally either evaluates the agent
    (``--evaluate``) or trains it with a step-based checkpoint hook.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='/tmp/chainerRL_results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--evaluate',
                        action='store_true',
                        default=False,
                        help="Run evaluation mode")
    parser.add_argument('--load',
                        type=str,
                        default=None,
                        help="Load saved_model")
    parser.add_argument('--steps', type=int, default=4 * 10**6)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=5 * 10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=1)
    # argparse applies `type` only to string defaults, so the defaults of the
    # int-typed options below are written as ints (10**k) instead of floats
    # (1ek); otherwise the parsed value would be a float unless the option is
    # given explicitly.
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**4,
                        help="After how many steps to evaluate the agent."
                        "(-1 -> always)")
    parser.add_argument('--n-hidden-channels', type=int, default=20)
    parser.add_argument('--n-hidden-layers', type=int, default=20)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--render-train', action='store_true')
    parser.add_argument('--render-eval', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1)
    parser.add_argument('--time-step-limit', type=int, default=10**5)
    parser.add_argument('--outdir-time-suffix',
                        choices=['empty', 'none', 'time'],
                        default='empty',
                        type=str.lower)
    parser.add_argument('--checkpoint_frequency',
                        type=int,
                        default=10**3,
                        help="Nuber of steps to checkpoint after")
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='Use debug log-level')
    args = parser.parse_args()
    import logging
    logging.basicConfig(
        level=logging.INFO if not args.verbose else logging.DEBUG)

    # Set a random seed used in ChainerRL ALSO SETS NUMPY SEED!
    misc.set_random_seed(args.seed)

    if args.outdir and not args.load:
        # Fresh run: let ChainerRL create the output directory, optionally
        # with a timestamp suffix.
        outdir_suffix_dict = {
            'none': '',
            'empty': '',
            'time': '%Y%m%dT%H%M%S.%f'
        }
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=outdir_suffix_dict[args.outdir_time_suffix])
    elif args.load:
        # Resuming: keep writing next to the loaded snapshot and move the
        # previous score files out of the way under the first free index.
        if args.load.endswith(os.path.sep):
            args.load = args.load[:-1]
        args.outdir = os.path.dirname(args.load)

        def scores_backup(index):
            # Only the file name is formatted; the directory path is used
            # verbatim so braces in it cannot be mistaken for placeholders.
            return os.path.join(args.outdir, 'scores_{:>03d}'.format(index))

        count = 0
        while os.path.exists(scores_backup(count)):
            count += 1
        os.rename(os.path.join(args.outdir, 'scores.txt'),
                  scores_backup(count))
        if os.path.exists(os.path.join(args.outdir, 'best')):
            os.rename(os.path.join(args.outdir, 'best'),
                      os.path.join(args.outdir, 'best_{:>03d}'.format(count)))

    logging.info('Output files are saved in {}'.format(args.outdir))

    def clip_action_filter(a):
        # `action_space` is bound further down in main(); it is looked up
        # when the filter is called, not when it is defined.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        """Build an FDEnvSelHeur environment wrapped for ChainerRL."""
        HOST = ''  # The server's hostname or IP address
        PORT = 54321  # The port used by the server
        if test:  # Just such that eval and train env don't have the same port
            PORT += 1

        # TODO don't hardcode env params
        # TODO if we use this solution (i.e. write port to file and read it with FD) we would have to make sure that
        # outdir doesn't append time strings. Otherwise it will get hard to use on the cluster
        env = FDEnvSelHeur(host=HOST,
                           port=PORT,
                           num_heuristics=2,
                           config_dir=args.outdir)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if ((args.render_eval and test) or (args.render_train and not test)):
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)

    timestep_limit = args.time_step_limit
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):  # Useful if we want to control
        action_size = action_space.low.size  # other continuous parameters
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if not args.load:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam(eps=1e-2)
    logging.info('Optimizer: %s', str(opt))
    opt.setup(q_func)
    opt.add_hook(GradientClipping(5))

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        # One beta-annealing step per model update after the warm-up phase.
        betasteps = (args.steps - args.replay_start_size) \
                    // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DDQN(
        q_func,
        opt,
        rbuf,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )
    t_offset = 0
    if args.load:  # Continue training model or load for evaluation
        agent.load(args.load)
        rbuf.load(os.path.join(args.load, 'replay_buffer.pkl'))
        # Recover the step offset: first from a "<step>_..." directory name,
        # then from a t.txt file inside the snapshot, else start at 0.
        # int() on a string raises ValueError (never TypeError), so the
        # t.txt fallback has to hang off ValueError to be reachable.
        try:
            t_offset = int(os.path.basename(args.load).split('_')[0])
        except ValueError:
            try:
                with open(os.path.join(args.load, 't.txt'), 'r') as fh:
                    t_offset = int(fh.readline())
            except (OSError, ValueError):
                t_offset = 0

    # NOTE(review): the evaluation env is built with test=False, i.e. with the
    # training port, seed and reward scaling, and --render-eval has no effect
    # on it. This looks unintended (sibling scripts use test=True) - confirm
    # before changing, since the port choice affects the external FD process.
    eval_env = make_env(test=False)

    if args.evaluate:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        criterion = 'steps'  # can be made an argument if we support any other form of checkpointing
        checkpoint_logger = logging.getLogger('Checkpoint_Hook')

        def checkpoint(env, agent, step):
            """Step hook: snapshot agent and replay buffer every N steps."""
            if criterion == 'steps':
                if step % args.checkpoint_frequency == 0:
                    save_agent_and_replay_buffer(
                        agent,
                        step,
                        args.outdir,
                        suffix='_chkpt',
                        logger=checkpoint_logger,
                        chckptfrq=args.checkpoint_frequency)
            else:
                # TODO seems to checkpoint given wall_time we would have to modify the environment such that it tracks
                # time or number of episodes
                raise NotImplementedError

        hooks = [checkpoint]
        experiments.train_agent(agent=agent,
                                env=env,
                                steps=args.steps,
                                outdir=args.outdir,
                                step_hooks=hooks,
                                step_offset=t_offset)
コード例 #18
0
def main():
    """Train or evaluate a DDQN agent on one of the ``SigMV`` scenarios.

    The function parses the command line, prepares the output directory (or,
    when resuming with ``--load``, backs up the previous ``scores.txt`` and
    ``best`` entries), builds the scenario environment, Q-function, explorer,
    optimizer and replay buffer, and finally either evaluates the agent
    (``--evaluate``) or trains it with periodic evaluation and checkpoints.

    Raises:
        NotImplementedError: if ``--scenario`` names a scenario for which no
            environment configuration exists in ``make_env``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='/tmp/chainerRL_results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--final-exploration-steps', type=int, default=10**4)
    parser.add_argument('--start-epsilon', type=float, default=1.0)
    parser.add_argument('--end-epsilon', type=float, default=0.1)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--evaluate',
                        action='store_true',
                        default=False,
                        help="Run evaluation mode")
    parser.add_argument('--load',
                        type=str,
                        default=None,
                        help="Load saved_model")
    parser.add_argument('--steps', type=int, default=10**6)
    parser.add_argument('--prioritized-replay', action='store_true')
    parser.add_argument('--replay-start-size', type=int, default=1000)
    parser.add_argument('--target-update-interval', type=int, default=10**2)
    parser.add_argument('--target-update-method', type=str, default='hard')
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=1)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=11)
    parser.add_argument('--n-hidden-channels', type=int, default=50)
    parser.add_argument('--n-hidden-layers', type=int, default=1)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--minibatch-size', type=int, default=None)
    parser.add_argument('--reward-scale-factor', type=float, default=1)
    parser.add_argument('--outdir-time-suffix',
                        choices=['empty', 'none', 'time'],
                        default='empty',
                        type=str.lower)
    # argparse applies `type` only to string defaults, so the default is
    # written as an int (10**3) rather than the float 1e3.
    parser.add_argument('--checkpoint_frequency',
                        type=int,
                        default=10**3,
                        help="Nuber of steps to checkpoint after")
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='Use debug log-level')
    parser.add_argument('--scenario',
                        choices=[
                            '1D-INST', '1D-DIST', '1DM', '2DM', '3DM', '5DM',
                            '1D3M', '2D3M', '3D3M', '5D3M'
                        ],
                        default='1D-INST',
                        type=str.upper,
                        help='Which scenario to use.')
    if __name__ != '__main__':
        print(__name__)
        parser.add_argument(
            '--timeout', type=int, default=0,
            help='Wallclock timeout in sec')  # Has no effect in this file!
        # can only be used in conjunction with "train_with_wallclock_limit.py"!
    args = parser.parse_args()
    import logging
    logging.basicConfig(
        level=logging.INFO if not args.verbose else logging.DEBUG)

    # Set a random seed used in ChainerRL ALSO SETS NUMPY SEED!
    misc.set_random_seed(args.seed)

    if args.outdir and not args.load:
        # Fresh run: let ChainerRL create the output directory, optionally
        # with a timestamp suffix.
        outdir_suffix_dict = {
            'none': '',
            'empty': '',
            'time': '%Y%m%dT%H%M%S.%f'
        }
        args.outdir = experiments.prepare_output_dir(
            args,
            args.outdir,
            argv=sys.argv,
            time_format=outdir_suffix_dict[args.outdir_time_suffix])
    elif args.load:
        # Resuming: keep writing next to the loaded snapshot and move the
        # previous score files out of the way under the first free index.
        if args.load.endswith(os.path.sep):
            args.load = args.load[:-1]
        args.outdir = os.path.dirname(args.load)

        def scores_backup(index):
            # Only the file name is formatted; the directory path is used
            # verbatim so braces in it cannot be mistaken for placeholders.
            return os.path.join(args.outdir, 'scores_{:>03d}'.format(index))

        count = 0
        while os.path.exists(scores_backup(count)):
            count += 1
        os.rename(os.path.join(args.outdir, 'scores.txt'),
                  scores_backup(count))
        if os.path.exists(os.path.join(args.outdir, 'best')):
            os.rename(os.path.join(args.outdir, 'best'),
                      os.path.join(args.outdir, 'best_{:>03d}'.format(count)))

    logging.info('Output files are saved in {}'.format(args.outdir))

    # action_vals of the scenarios that only differ in the action layout;
    # n_actions is the length of the tuple.
    seeded_scenarios = {
        '1D-DIST': (2, ),  # Used to create Figure 2(a)
        '1D3M': (3, ),  # Used to create Figure 3(a)
        '2D3M': (3, 3),  # Used to create Figure 3(b)
        '3D3M': (3, 3, 3),  # Used to create Figure 3(c)
        '5D3M': (3, 3, 3, 3, 3),  # Used to create Figure 3(d)
    }

    def make_env(test):
        """Build the SigMV environment selected by ``--scenario``."""
        if args.scenario == '1D-INST':  # Used to create Figures 2(b)&(c)
            # Fixed instance set read from a CSV file; train and test use
            # different files but the same seed.
            env = SigMV(instance_feats=os.path.join(
                os.path.dirname(os.path.realpath(__file__)), '..', 'envs',
                'feats.csv' if not test else 'test_feats.csv'),
                        seed=args.seed,
                        n_actions=1,
                        action_vals=(2, ))
        elif args.scenario in seeded_scenarios:
            action_vals = seeded_scenarios[args.scenario]
            # Use different random seeds for train and test envs
            env_seed = 2**32 - 1 - args.seed if test else args.seed
            env = SigMV(n_actions=len(action_vals),
                        action_vals=action_vals,
                        seed=env_seed)
        else:
            # The parser accepts more scenario names than are configured
            # here; fail with a clear message instead of an
            # UnboundLocalError on `env` below.
            raise NotImplementedError(
                'Scenario {} has no environment configuration'.format(
                    args.scenario))
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        return env

    env = make_env(test=False)
    timestep_limit = 10**3  # TODO don't hardcode env params
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    n_actions = action_space.n
    q_func = q_functions.FCStateQFunctionWithDiscreteAction(
        obs_size,
        n_actions,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    explorer = explorers.LinearDecayEpsilonGreedy(args.start_epsilon,
                                                  args.end_epsilon,
                                                  args.final_exploration_steps,
                                                  action_space.sample)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    if not args.load:
        chainerrl.misc.draw_computational_graph(
            [q_func(np.zeros_like(obs_space.low, dtype=np.float32)[None])],
            os.path.join(args.outdir, 'model'))

    opt = optimizers.Adam(eps=1e-2)
    opt.setup(q_func)
    opt.add_hook(GradientClipping(5))

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        # One beta-annealing step per model update after the warm-up phase.
        betasteps = (args.steps - args.replay_start_size) \
                    // args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(rbuf_capacity,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(rbuf_capacity)

    agent = DDQN(
        q_func,
        opt,
        rbuf,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )
    t_offset = 0
    if args.load:  # Continue training model or load for evaluation
        agent.load(args.load)
        rbuf.load(os.path.join(args.load, 'replay_buffer.pkl'))
        # Recover the step offset: first from a "<step>_..." directory name,
        # then from a t.txt file inside the snapshot, else start at 0.
        # int() on a string raises ValueError (never TypeError), so the
        # t.txt fallback has to hang off ValueError to be reachable.
        try:
            t_offset = int(os.path.basename(args.load).split('_')[0])
        except ValueError:
            try:
                with open(os.path.join(args.load, 't.txt'), 'r') as fh:
                    t_offset = int(fh.readline())
            except (OSError, ValueError):
                t_offset = 0

    eval_env = make_env(test=True)

    if args.evaluate:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        criterion = 'steps'  # can be made an argument if we support any other form of checkpointing
        checkpoint_logger = logging.getLogger('Checkpoint_Hook')

        def checkpoint(env, agent, step):
            """Step hook: snapshot agent and replay buffer every N steps."""
            if criterion == 'steps':
                if step % args.checkpoint_frequency == 0:
                    save_agent_and_replay_buffer(
                        agent,
                        step,
                        args.outdir,
                        suffix='_chkpt',
                        logger=checkpoint_logger,
                        chckptfrq=args.checkpoint_frequency)
            else:
                # TODO seems to checkpoint given wall_time we would have to modify the environment such that it tracks
                # time or number of episodes
                raise NotImplementedError

        def eval_hook(env, agent, step):
            """
            Step hook that measures the agent on the training environment.

            Every 10th step it runs 100 episodes on ``env`` (presumably one
            per training instance), averages the episode returns and appends
            "<mean return>\\t<step>" to ``train_reward.txt`` in the output
            directory.

            NOTE(review): the rollouts reset and step the training env itself
            and go through ``agent.act`` - confirm that this does not disturb
            the episode that the surrounding training loop is in.

            :param env: The training environment
            :param agent: (Partially) Trained agent
            :param step: Number of observed training steps.
            :return:
            """
            if step % 10 == 0:
                train_reward = 0
                for _ in range(100):
                    obs = env.reset()
                    done = False
                    rews = 0
                    while not done:
                        obs, r, done, _ = env.step(agent.act(obs))
                        rews += r
                    train_reward += rews
                train_reward = train_reward / 100
                with open(os.path.join(args.outdir, 'train_reward.txt'),
                          'a') as fh:
                    # A single string is written, so write() is the right
                    # call (writelines() would iterate it char by char).
                    fh.write(str(train_reward) + '\t' + str(step) + '\n')

        hooks = [checkpoint]
        if args.scenario == '1D-INST':
            hooks.append(eval_hook)
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,  # unlimited number of steps per evaluation rollout
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit,
            step_hooks=hooks,
            step_offset=t_offset)
コード例 #19
0
ファイル: train_dqn_gym.py プロジェクト: phymucs/chainerrl
def main():
    """Train a DQN agent on a Gym environment, or evaluate it with ``--demo``.

    Command-line options select the environment, network size, exploration
    schedule, replay buffer type and training length. Continuous (``Box``)
    action spaces get a quadratic (NAF) Q-function with Ornstein-Uhlenbeck
    exploration; all other action spaces get a fully connected discrete
    Q-function with linearly decayed epsilon-greedy exploration.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    # (flag, add_argument keyword arguments); registered in exactly this
    # order so the generated help text keeps its layout.
    option_table = [
        ('--outdir', dict(type=str,
                          default='results',
                          help='Directory path to save output files.'
                          ' If it does not exist, it will be created.')),
        ('--env', dict(type=str, default='Pendulum-v0')),
        ('--seed', dict(type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')),
        ('--gpu', dict(type=int, default=0)),
        ('--final-exploration-steps', dict(type=int, default=10**4)),
        ('--start-epsilon', dict(type=float, default=1.0)),
        ('--end-epsilon', dict(type=float, default=0.1)),
        ('--noisy-net-sigma', dict(type=float, default=None)),
        ('--demo', dict(action='store_true', default=False)),
        ('--load', dict(type=str, default=None)),
        ('--steps', dict(type=int, default=10**5)),
        ('--prioritized-replay', dict(action='store_true')),
        ('--replay-start-size', dict(type=int, default=1000)),
        ('--target-update-interval', dict(type=int, default=10**2)),
        ('--target-update-method', dict(type=str, default='hard')),
        ('--soft-update-tau', dict(type=float, default=1e-2)),
        ('--update-interval', dict(type=int, default=1)),
        ('--eval-n-runs', dict(type=int, default=100)),
        ('--eval-interval', dict(type=int, default=10**4)),
        ('--n-hidden-channels', dict(type=int, default=100)),
        ('--n-hidden-layers', dict(type=int, default=2)),
        ('--gamma', dict(type=float, default=0.99)),
        ('--minibatch-size', dict(type=int, default=None)),
        ('--render-train', dict(action='store_true')),
        ('--render-eval', dict(action='store_true')),
        ('--monitor', dict(action='store_true')),
        ('--reward-scale-factor', dict(type=float, default=1e-3)),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Seed ChainerRL's random number generators (including the chosen GPU).
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    def clip_to_bounds(a):
        # `action_space` is assigned later in main(); it is resolved when
        # the filter runs, not when it is defined.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        """Create the Gym environment and stack the ChainerRL wrappers."""
        wrappers = chainerrl.wrappers
        wrapped = gym.make(args.env)
        # Train and test environments are seeded differently.
        if test:
            wrapped.seed(2**32 - 1 - args.seed)
        else:
            wrapped.seed(args.seed)
        # The model works on float32 observations.
        wrapped = wrappers.CastObservationToFloat32(wrapped)
        if args.monitor:
            wrapped = wrappers.Monitor(wrapped, args.outdir)
        if isinstance(wrapped.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(wrapped, clip_to_bounds)
        if not test:
            # Training rewards are scaled to a range that is easier to
            # learn from; evaluation rewards stay untouched.
            wrapped = wrappers.ScaleReward(wrapped, args.reward_scale_factor)
        wants_render = args.render_eval if test else args.render_train
        if wants_render:
            wrapped = wrappers.Render(wrapped)
        return wrapped

    train_env = make_env(test=False)
    max_episode_len = train_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = train_env.observation_space
    obs_size = obs_space.low.size
    action_space = train_env.action_space

    hidden = dict(n_hidden_channels=args.n_hidden_channels,
                  n_hidden_layers=args.n_hidden_layers)
    if not isinstance(action_space, spaces.Box):
        # Discrete actions: plain Q-network with epsilon-greedy exploration.
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size, action_space.n, **hidden)
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample)
    else:
        # Continuous actions: NAF Q-function with Ornstein-Uhlenbeck noise.
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_space.low.size,
            action_space=action_space,
            **hidden)
        explorer = explorers.AdditiveOU(
            sigma=(action_space.high - action_space.low) * 0.2)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Noisy layers take over exploration, so act greedily.
        explorer = explorers.Greedy()

    # Save a drawing of the computational graph in the output directory.
    dummy_obs = np.zeros_like(obs_space.low, dtype=np.float32)[None]
    chainerrl.misc.draw_computational_graph(
        [q_func(dummy_obs)], os.path.join(args.outdir, 'model'))

    optimizer = optimizers.Adam()
    optimizer.setup(q_func)

    capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if not args.prioritized_replay:
        replay = replay_buffer.ReplayBuffer(capacity)
    else:
        # One beta-annealing step per model update after the warm-up phase.
        updates_after_warmup = (
            (args.steps - args.replay_start_size) // args.update_interval)
        replay = replay_buffer.PrioritizedReplayBuffer(
            capacity, betasteps=updates_after_warmup)

    agent_options = dict(
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )
    agent = DQN(q_func, optimizer, replay, **agent_options)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if not args.demo:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=train_env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=max_episode_len)
        return

    # Demo mode: evaluate only and print summary statistics.
    eval_stats = experiments.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=max_episode_len)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
コード例 #20
0
ファイル: train_dqn.py プロジェクト: zhjmcjk/chainerrl
def main():
    """Train or evaluate a DQN agent on an Atari environment.

    Command-line options are parsed with ``argparse``. Separately seeded
    training and evaluation environments are built, a Q-function based on
    ``links.NatureDQNHead`` is wrapped in ``agents.DQN``, and then either

    * ``--demo``: the agent is evaluated and summary statistics are printed, or
    * otherwise: the agent is trained with periodic evaluation and results
      are written to the prepared output directory.

    When ``--noisy-net-sigma`` is given, the linear layers are converted to
    factorized noisy layers with that sigma scale and the agent acts greedily;
    otherwise a linearly decaying epsilon-greedy explorer is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**6,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    # NOTE(review): --arch is parsed but never read below; the network is
    # always built from links.NatureDQNHead.
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=5 * 10**4,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)

    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        """Build one wrapped Atari env; ``test`` selects the eval variant."""
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        # Episode-life handling and reward clipping are enabled only for
        # training environments.
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                            DiscreteActionValue)

    # Choose the exploration strategy exactly once. Previously the greedy
    # explorer selected for noisy nets was overwritten further down by the
    # epsilon-greedy one, and the sigma given on the command line was ignored.
    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer: act greedily on the noisy Q-function.
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon, args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        # Feature extractor: convert to float32 and divide by 255.
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        # NOTE(review): this call uses the `n_runs` keyword while the training
        # call below uses `eval_n_episodes`; confirm both match the installed
        # ChainerRL API.
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
コード例 #21
0
def main():
    """Train and/or evaluate a categorical double DQN on CarIntersect-v3.

    Command-line options are parsed with ``argparse``. A vectorized
    environment with ``--num-envs`` subprocess workers is created, a
    ``DistributionalDuelingDQN`` Q-function is converted to factorized noisy
    layers and wrapped in ``agents.CategoricalDoubleDQN`` with a prioritized
    replay buffer.

    * ``--load``: the agent is loaded from ``train/best`` and evaluated for
      10 episodes; the statistics are printed.
    * Training runs when ``--train`` is given or ``--load`` is absent.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): --env is parsed but the environment id is hard-coded in
    # make_car_env_discrete below.
    parser.add_argument('--env', type=str, default='CarIntersect-v3')
    parser.add_argument('--outdir',
                        type=str,
                        default='train/results',
                        help='Directory path to save output files.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', action='store_true', default=None)
    parser.add_argument('--train', action='store_true', default=None)
    parser.add_argument('--eval-epsilon', type=float, default=0.0)
    parser.add_argument('--noisy-net-sigma', type=float, default=0.5)
    parser.add_argument('--steps', type=int, default=2 * 10**6)
    parser.add_argument('--replay-start-size', type=int, default=2 * 10**4)
    parser.add_argument('--eval-n-episodes', type=int, default=5)
    parser.add_argument('--eval-interval', type=int, default=10**4)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env.')
    parser.add_argument('--num-envs', type=int, default=40)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=2 * 10**4)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    # One distinct seed per subprocess environment.
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs

    def make_car_env_discrete(max_frames=30 * 30,
                              env_seed=42,
                              random_suffix=None):
        """Build a single wrapped CarIntersect-v3 env seeded with env_seed."""
        print('CarIntersect-v3')
        env = gym.make('CarIntersect-v3')
        env = chainerrl.wrappers.ContinuingTimeLimit(
            env, max_episode_steps=max_frames)
        env = MaxAndSkipEnv(env, skip=4)
        env = DiscreteWrapper(env)
        print('save_wrapper')
        env = SaveWrapper(env, random_suffix=random_suffix)
        env = WarpFrame(env)
        env.seed(env_seed)
        return env

    def make_batch_env(test):
        """Build the vectorized env; ``test`` is accepted but not used."""
        # Give every worker its own seed. Previously `process_seeds` was
        # computed but never used, so all workers were seeded with the
        # default value 42.
        vec_env = chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_car_env_discrete,
                              env_seed=int(process_seeds[idx]))
            for idx in range(args.num_envs)
        ])
        vec_env = chainerrl.wrappers.VectorFrameStack(vec_env, 4)
        # print(vec_env.observation_space)
        return vec_env

    env = make_batch_env(test=False)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN(n_actions,
                                      n_atoms,
                                      v_min,
                                      v_max,
                                      n_input_channels=12)

    # Noisy nets
    links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Epsilon-greedy exploration is kept in addition to the noisy layers:
    # epsilon decays linearly from 0.3 to --final-epsilon.
    explorer = explorers.LinearDecayEpsilonGreedy(
        0.3, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    # Draw the computational graph and save it in the output directory.
    # chainerrl.misc.draw_computational_graph(
    #     [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
    #     os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(0.00025, eps=1.5 * 10**-4)
    opt.setup(q_func)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(10**5,
                                                 alpha=0.5,
                                                 beta0=0.4,
                                                 betasteps=betasteps,
                                                 num_steps=10)

    def phi(x):
        # Feature extractor: convert to float32 and divide by 255.
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.CategoricalDoubleDQN
    print(args.replay_start_size)

    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=64,
        replay_start_size=args.replay_start_size,
        target_update_interval=3 * 10**3,
        update_interval=update_interval,
        batch_accumulator='mean',
        phi=phi,
    )

    if args.load is True:
        print('evaluation started')
        dir_of_best_network = os.path.join("train/", "best")
        agent.load(dir_of_best_network)

        # Evaluation runs on the same vectorized env that is used for
        # training.
        stats = experiments.evaluator.eval_performance(env=env,
                                                       agent=agent,
                                                       n_steps=None,
                                                       n_episodes=10,
                                                       logger=None)
        print(stats)

    if args.train or not args.load:
        print('training started')
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_episodes,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            log_interval=1000,
        )
コード例 #22
0
def main():
    """Train or evaluate a DQN-family agent on batched Atari environments.

    Command-line options are parsed with ``argparse``. ``--num-envs``
    subprocess environments are created with distinct seeds and stacked with
    ``VectorFrameStack``. The Q-function comes from ``parse_arch`` and the
    agent class from ``parse_agent``.

    * ``--demo``: the agent is evaluated on a test batch env and summary
      statistics are printed.
    * Otherwise the agent is trained with periodic evaluation.

    When ``--noisy-net-sigma`` is given, the linear layers are converted to
    factorized noisy layers with that sigma scale and the agent acts greedily;
    otherwise a linearly decaying epsilon-greedy explorer is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames', type=int, default=10**6)
    parser.add_argument('--final-epsilon', type=float, default=0.01)
    parser.add_argument('--eval-epsilon', type=float, default=0.001)
    parser.add_argument('--noisy-net-sigma', type=float, default=None)
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'])
    parser.add_argument('--steps', type=int, default=5 * 10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--replay-start-size', type=int, default=5 * 10**4)
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=3 * 10**4)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--agent',
                        type=str,
                        default='DoubleDQN',
                        choices=['DQN', 'DoubleDQN', 'PAL'])
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate')
    parser.add_argument('--prioritized',
                        action='store_true',
                        default=False,
                        help='Use prioritized experience replay.')
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(idx, test):
        """Build the idx-th wrapped Atari env; ``test`` selects eval mode."""
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        # frame_stack=False: stacking is applied later by VectorFrameStack
        # on the whole batch env.
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            frame_stack=False,
        )
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        """Build a frame-stacked vector env of ``--num-envs`` workers."""
        vec_env = chainerrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])
        vec_env = chainerrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    # A single env is built only to read the size of the action space.
    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    # Choose the exploration strategy exactly once. Previously the greedy
    # explorer selected for noisy nets was overwritten further down by the
    # epsilon-greedy one, and the sigma given on the command line was ignored.
    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer: act greedily on the noisy Q-function.
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0, args.final_epsilon, args.final_exploration_frames,
            lambda: np.random.randint(n_actions))

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(10**6,
                                                     alpha=0.6,
                                                     beta0=0.4,
                                                     betasteps=betasteps)
    else:
        rbuf = replay_buffer.ReplayBuffer(10**6)

    def phi(x):
        # Feature extractor: convert to float32 and divide by 255.
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.99,
                  explorer=explorer,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=args.target_update_interval,
                  clip_delta=args.clip_delta,
                  update_interval=args.update_interval,
                  batch_accumulator='sum',
                  phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
コード例 #23
0
ファイル: dqn.py プロジェクト: chokozainer/chokozainerrl
def chokoDQN(env, args=None):
    """Build and return a ``DQN`` agent for ``env``.

    Args:
        env: Environment exposing ``observation_space`` and ``action_space``.
        args: Either a list of arguments, which is passed through
            ``make_args``, or an already prepared settings object. A falsy
            value is treated as an empty list.

    Returns:
        The constructed ``DQN`` agent.

    A ``spaces.Box`` action space gets a quadratic Q-function with
    Ornstein-Uhlenbeck exploration; any other action space gets a
    discrete-action Q-function with linearly decaying epsilon-greedy
    exploration. If ``args.noisy_net_sigma`` is set, the Q-function is
    converted to factorized noisy layers and exploration becomes greedy.
    """
    if not args:
        args = []
    if type(args) is list:
        args = make_args(args)

    observation_space = env.observation_space
    # Input size is the flat observation size times the stack_k setting.
    obs_size = observation_space.low.size * args.stack_k
    action_space = env.action_space

    if not isinstance(action_space, spaces.Box):
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            action_space.n,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        # Epsilon-greedy exploration with a linearly decaying epsilon.
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon, args.end_epsilon, args.final_exploration_steps,
            action_space.sample)
    else:
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_space.low.size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space)
        # Ornstein-Uhlenbeck noise scaled to 20% of the action range.
        action_range = action_space.high - action_space.low
        explorer = explorers.AdditiveOU(sigma=action_range * 0.2)

    if args.noisy_net_sigma is not None:
        links.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # The explorer chosen above is replaced by a greedy one.
        explorer = explorers.Greedy()

    optimizer = optimizers.Adam()
    optimizer.setup(q_func)

    capacity = 5 * 10**5
    # Fall back to a minibatch size of 32; the default is written back
    # onto args.
    if args.minibatch_size is None:
        args.minibatch_size = 32

    if args.prioritized_replay:
        n_updates = (args.steps - args.replay_start_size) \
            // args.update_interval
        buffer = replay_buffer.PrioritizedReplayBuffer(capacity,
                                                       betasteps=n_updates)
    else:
        buffer = replay_buffer.ReplayBuffer(capacity)

    return DQN(
        q_func,
        optimizer,
        buffer,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )