Example #1
def make_a3c_agent(obs_space_dim, action_space_dim):
    model = A3CLSTMGaussian(obs_space_dim, action_space_dim)
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    agent = a3c.A3C(model, opt, t_max=5, gamma=1, beta=1e-2, phi=phi)
    return agent
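For context, an agent built by a factory like this is driven through ChainerRL's standard agent interface. A minimal single-process training-loop sketch, assuming a Gym-style `env` already exists (the function name and `max_steps` are illustrative):

def run_training_episode(agent, env, max_steps=200):
    # Hedged sketch: drive one episode with ChainerRL's agent interface.
    obs = env.reset()
    reward = 0.0
    done = False
    for _ in range(max_steps):
        # act_and_train picks an action and updates the model every t_max steps
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        if done:
            break
    # Flush the final transition and reset the recurrent state for the next episode
    agent.stop_episode_and_train(obs, reward, done)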
Example #2
def get_pretrained_agent(agent_path="./"):
    model = pretrained_NN(ndim_obs=9, n_actions=9)
    opt = rmsprop_async.RMSpropAsync()
    opt.setup(model)
    agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, beta=1e-2)
    agent.load(agent_path)
    return agent
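The path handed to `agent.load` should be a directory previously written by `agent.save`, which ChainerRL uses to store `model.npz` and `optimizer.npz`. A hypothetical round trip, assuming `trained_agent` is an A3C agent that has already been trained with the same `pretrained_NN` architecture (the path is a placeholder):

trained_agent.save("./pretrained_agent")             # persists model.npz / optimizer.npz
agent = get_pretrained_agent("./pretrained_agent")   # rebuild the network and restore it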
Example #3
    def create_a3c_agent():
        # env = gym.make('malware-v0')
        # obs_size = env.observation_space.shape[1]
        # action_space = env.action_space
        # n_actions = action_space.n
        obs_size = 8006
        n_actions = 6

        # Switch policy types according to the action space type
        if args.arch == 'FFSoftmax':
            model = A3CFFSoftmax(obs_size, n_actions)
        elif args.arch == 'FFMellowmax':
            model = A3CFFMellowmax(obs_size, n_actions)

        if args.gpu:
            pass
        #     model.to_gpu(0)

        opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                         eps=args.rmsprop_epsilon,
                                         alpha=0.99)
        opt.setup(model)

        agent = a3c.A3C(model,
                        opt,
                        t_max=args.t_max,
                        gamma=0.99,
                        beta=args.beta)

        return agent
Example #4
 def __init__(self, gpu=False):
     self.model = A3CFFSoftmax(gpu)
     if gpu:
         self.model.to_gpu(0)
     self.optimizer = rmsprop_async.RMSpropAsync(lr=7e-4,
                                                 eps=1e-1,
                                                 alpha=0.99)
     self.agent = a3c.A3C(self.model,
                          self.optimizer,
                          t_max=5,
                          gamma=0.99,
                          beta=1e-2,
                          phi=phi)
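     # NOTE: these hooks only take effect if the surrounding code also
     # registers them with the optimizer, e.g. via self.optimizer.add_hook(...).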
     self.add_hooks = [chainer.optimizer.GradientClipping(40)]
Example #5
def CREATE_AGENT(env, agent_name):

    gamma = 0.9

    if agent_name == "DoubleDQN":
        q_func = QFunction(env.OBSDIM * NUM_EYES, env.action_space_d.n)
        # q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
        #    env.OBSDIM*(NUM_EYES), env.action_space_d.n,
        #    n_hidden_layers=2, n_hidden_channels=50)

        # q_func.to_gpu(0)

        optimizer = chainer.optimizers.Adam(eps=1e-2)
        optimizer.setup(q_func)

        # explorer = chainerrl.explorers.ConstantEpsilonGreedy(
        #     epsilon=0.3, random_action_func=env.action_space_d.sample)

        explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
            start_epsilon=0.5, end_epsilon=0.1, decay_steps=10000, random_action_func=env.action_space_d.sample)

        replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

        agent = chainerrl.agents.DoubleDQN(
            q_func, optimizer, replay_buffer, gamma, explorer,
            replay_start_size=500, update_interval=1,
            target_update_interval=100)

        return agent

    if agent_name == "A3CFF":
        #        n_actions = ale.ALE(str(env.action_space_d.n)).number_of_actions
        #        model = A3CFF(n_actions)
        model = A3CFF(env.OBSDIM * NUM_EYES, env.action_space_d.n)

        optimizer = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.9)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(40))

        agent = a3c.A3C(model, optimizer, t_max=4,
                        gamma=0.9, beta=1e-2, phi=dqn_phi)

        return agent
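`dqn_phi` above comes from the surrounding module. In ChainerRL's Atari examples the same name refers to a preprocessor that casts stacked uint8 frames to float32 and rescales them to [0, 1]; a stand-in with that behaviour is sketched below (whether this codebase's version also rescales is an assumption):

import numpy as np

def dqn_phi(screens):
    # Assumed feature extractor: convert raw uint8 observations to float32
    # in [0, 1] before they reach the A3C model.
    return np.asarray(screens, dtype=np.float32) / 255.0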
Example #6
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=1000000):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        bound_mean=True,
                        min_action=action_space.low,
                        max_action=action_space.high),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2),
                )
        eps = 1e-1 if discrete else 1e-2
        opt = rmsprop_async.RMSpropAsync(lr=5e-4, eps=eps, alpha=0.99)
        opt.setup(model)
        gamma = 0.9
        beta = 1e-2
        agent = a3c.A3C(model,
                        opt,
                        t_max=t_max,
                        gamma=gamma,
                        beta=beta,
                        phi=phi,
                        act_deterministically=True)

        max_episode_len = None if episodic else 2

        train_agent_async(outdir=self.outdir,
                          processes=nproc,
                          make_env=make_env,
                          agent=agent,
                          steps=steps,
                          max_episode_len=max_episode_len,
                          eval_interval=500,
                          eval_n_runs=5,
                          successful_score=1)

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example #7
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int, default=4)  # increase for more asynchronous workers
    parser.add_argument('--outdir', type=str, default='a3c_training', help='Directory path to save output files. If it does not exist, it will be created.')  # set directory to which output files will be written
    parser.add_argument('--env', type=str, default='1DIsing-A3C-v0')  # specify environment to explore

    parser.add_argument('--steps', type=int, default=1 * 10 ** 7)  # maximum number of steps before training ends
    parser.add_argument('--eval-interval', type=int, default=10**4)  # frequency at which the agent will be evaluated
    parser.add_argument('--eval-n-runs', type=int, default=10)  # number of evaluation runs per evaluation
    parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax',))  # NN to use for policy and state value estimates
    parser.add_argument('--t-max', type=int, default=5)  # increase for later truncation of the sum
    parser.add_argument('--beta', type=float, default=1e-2)    # increase for more exploration
    parser.add_argument('--gamma', type=float, default=0.99)    # increase to discount future rewards less
    parser.add_argument('--lr', type=float, default=1 * 1e-4)  # decrease for a slower learning rate
    parser.add_argument('--weight-decay', type=float, default=0)  # turn on to get weight decay

    parser.add_argument('--seed', type=int, default=17, help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)  # set to logging.DEBUG for (much more) information
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    model = A3CFFSoftmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma, beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
Example #8
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.99, beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Example #9
def main():

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None],
                                name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph([model(fake_obs)],
                                                os.path.join(
                                                    args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test,
                      seed=env_seed)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=args.max_episode_len,
                                      global_step_hooks=[lr_decay_hook])
Example #10
def make_chainer_a3c(obs_size, action_size):
    model = A3CLSTMGaussian(obs_size, action_size)
    opt = optimizers.Adam(eps=1e-2)
    opt.setup(model)
    agent = a3c.A3C(model, opt, 10 ** 5, 0.9)
    return agent
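Here the third and fourth positional arguments are `t_max` and `gamma`. Note also that the other examples pair A3C with `rmsprop_async.RMSpropAsync`, whose state is designed to be shared safely across asynchronous workers; a variant of this factory written that way, with the keywords spelled out, might look like the sketch below (the function name is illustrative):

def make_chainer_a3c_rmsprop(obs_size, action_size):
    # Sketch of an alternative factory: same model and hyperparameters, but
    # with the shared-state RMSprop optimizer used in the other A3C examples.
    model = A3CLSTMGaussian(obs_size, action_size)
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    return a3c.A3C(model, opt, t_max=10 ** 5, gamma=0.9)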
Example #11
    fake_obs = chainer.Variable(np.zeros(obs_size, dtype=np.float32)[None], name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
                [model(fake_obs)],
                os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.eps, alpha=args.alpha)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(args.gclipping))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    phi = lambda x: x.astype(np.float32, copy=False)

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=args.gamma,
                    beta=args.beta, phi=phi)

    lr_decay_hook = experiments.LinearInterpolationHook(args.steps, args.lr, 0, lr_setter)

    training = experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.threads,
            make_env=make_env,
            profile=False,
            steps=args.steps,
            eval_interval=args.eval_interval,
            eval_n_episodes=args.eval_n_runs,
            max_episode_len=args.max_episode_len,
            successful_score=args.stop,
            global_step_hooks=[lr_decay_hook])
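`lr_setter` is referenced above but not shown in this fragment. In the other examples it is the small callback below, so something equivalent is presumably defined earlier in the same function:

    def lr_setter(env, agent, value):
        # Invoked by LinearInterpolationHook with the interpolated value,
        # which is written back into the optimizer's learning rate.
        agent.optimizer.lr = value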
Example #12
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Switch policy types according to the action space type
    if args.arch == 'LSTMGaussian':
        model = A3CLSTMGaussian(obs_space.low.size, action_space.low.size)
    elif args.arch == 'FFSoftmax':
        model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    elif args.arch == 'FFMellowmax':
        model = A3CFFMellowmax(obs_space.low.size, action_space.n)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)
Example #13
def main():
    import argparse
    parser = argparse.ArgumentParser()

    # Simulation params
    parser.add_argument('--name', type=str, default='EIIE_A3C')
    parser.add_argument('--processes',
                        type=int,
                        default=4,
                        help="number of environment instances to use")
    parser.add_argument('--seed', type=int, default=42, help="random seed")
    parser.add_argument(
        '--obs_steps',
        type=int,
        default=64,
        help=
        "Observation steps, number of candles required by the agent for calculations"
    )
    parser.add_argument(
        '--period',
        type=int,
        default=30,
        help="Observation period in minutes, also trading frequency")
    parser.add_argument('--max_episode_len',
                        type=int,
                        default=5,
                        help="Max timesteps per episode")
    parser.add_argument('--steps',
                        type=int,
                        default=1e6,
                        help="Training steps")
    parser.add_argument('--eval-interval', type=int, default=None)
    parser.add_argument('--eval-n-runs', type=int, default=10)

    # Learning params
    parser.add_argument('--n_filters_in',
                        type=int,
                        default=8,
                        help="number of input filters heads to train")
    parser.add_argument('--n_filters_out',
                        type=int,
                        default=256,
                        help="number of pattern recognition neurons to train")
    parser.add_argument('--t_max',
                        type=int,
                        default=5,
                        help="Timesteps before update main model")
    parser.add_argument('--grad_noise',
                        type=float,
                        default=0.0,
                        help="gradient noise to apply")
    parser.add_argument('--lr', type=float, default=1e-4, help="learning rate")
    parser.add_argument('--lr_decay',
                        type=float,
                        default=1000,
                        help="learning rate linear decay rate")
    parser.add_argument(
        '--alpha',
        type=float,
        default=0.99,
        help=
        "Exponential decay rate of the second order moment for rmsprop optimizer"
    )
    parser.add_argument('--rmsprop-epsilon',
                        type=float,
                        default=1e-1,
                        help="fuzz factor")
    parser.add_argument('--clip_grad',
                        type=float,
                        default=100,
                        help="Clip gradient norm")
    parser.add_argument('--reward-scale-factor',
                        type=float,
                        default=1.,
                        help="scale environment reward")

    # Regularization
    parser.add_argument('--beta',
                        type=float,
                        default=1e-3,
                        help="entropy regularization weight for policy")
    parser.add_argument('--beta_decay',
                        type=float,
                        default=1000,
                        help="entropy regularization decay rate")
    parser.add_argument('--l2_reg',
                        type=float,
                        default=0,
                        help="l2 regularization coefficient")
    parser.add_argument('--l1_reg',
                        type=float,
                        default=0,
                        help="l1 regularization coefficient")
    parser.add_argument('--gamma',
                        type=float,
                        default=0.999,
                        help="discount factor")

    # Misc
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true', default=False)
    parser.add_argument('--download', action='store_true', default=False)
    parser.add_argument('--datafeed', action='store_true', default=False)

    # Dir options
    parser.add_argument('--data_dir', type=str, default='./data')
    parser.add_argument('--out_dir', type=str, default='./save')
    parser.add_argument('--load_dir', type=str, default='./save')
    parser.add_argument('--log_dir', type=str, default='./logs')
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)

    args = parser.parse_args()
    logging.getLogger().setLevel(args.logger_level)
    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.log_dir)

    # Simulation Params
    # Universe
    pairs = [
        'BTC_ETH', 'BTC_BCH', 'BTC_XRP', 'BTC_STR', 'BTC_LTC', 'BTC_DASH',
        'BTC_XMR', 'BTC_ETC', 'BTC_ZEC', 'BTC_BTS', 'BTC_LSK', 'BTC_XEM',
        'BTC_VTC', 'BTC_STRAT', 'BTC_EMC2', 'BTC_NXT', 'BTC_OMG'
    ]  # Universe, some survivor bias here...
    fiat_symbol = 'BTC'  # Quote symbol

    init_funds = make_balance(
        crypto=0.0, fiat=100.0,
        pairs=pairs)  # Initial equally distributed portfolio

    # NN params
    timesteps = args.obs_steps - 1
    n_filters_in = args.n_filters_in
    n_filters_out = args.n_filters_out

    if args.load_dir:
        try:
            last_save = np.argmax([
                int(d.split('_')[0]) for d in listdir(args.load_dir)
                if '.' not in d
            ])
            load_dir = args.load_dir + "/" + listdir(args.load_dir)[last_save]
            global_t = int(
                listdir(args.load_dir)[int(last_save)].split('_')[0])
        except ValueError as e:
            load_dir = False
            global_t = 0
    else:
        load_dir = None
        global_t = 0

    # Make environment function
    if args.datafeed:
        papi = DataFeed(args.period, pairs, 'polo_test', 'ipc://feed.ipc')
    else:
        papi = BacktestDataFeed(Poloniex(), args.period, pairs)

    if args.download:
        print("Downloading data...")
        papi.download_data(
            end=datetime.timestamp(datetime.utcnow() - timedelta(days=50)),
            start=datetime.timestamp(datetime.utcnow() - timedelta(days=300)))
        papi.save_data(args.data_dir + '/train')

    def make_env(process_idx, test):
        tapi = BacktestDataFeed(papi,
                                args.period,
                                pairs=pairs,
                                balance=init_funds,
                                load_dir=args.data_dir)
        tapi.load_data('/train')

        # Environment setup
        env = BacktestEnvironment(args.period, args.obs_steps, tapi,
                                  fiat_symbol, args.name)
        env.setup()

        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)

        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)

        return env

    # Model declaration
    print("Instantiating model")
    model = cn_agents.A3CEIIE(timesteps,
                              len(pairs) + 1, n_filters_in,
                              n_filters_out)  #.to_gpu(0)

    # Optimizer
    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=args.alpha)
    opt.setup(model)
    if args.clip_grad:
        opt.add_hook(cn.optimizer.GradientClipping(args.clip_grad))
    if args.grad_noise:
        opt.add_hook(cn.optimizer.GradientNoise(args.grad_noise))
    if args.l2_reg:
        opt.add_hook(cn.optimizer.WeightDecay(args.l2_reg))
    if args.l1_reg:
        opt.add_hook(cn.optimizer.Lasso(args.l1_reg))

    # Agent
    print("Building agent")
    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=args.gamma,
                    beta=args.beta,
                    phi=phi,
                    normalize_grad_by_t_max=True,
                    act_deterministically=False,
                    v_loss_coef=1.0)

    # Load information
    if load_dir:
        agent.load(load_dir)
        print("Model loaded from %s" % (load_dir))

    # Training hooks
    pp = PrintProgress(time())

    def lr_setter(env, agent, value):
        agent.optimizer.lr = value

    def beta_setter(env, agent, value):
        agent.beta = value

    lr_decay = LinearInterpolationHook(int(args.steps), args.lr,
                                       args.lr / args.lr_decay, lr_setter)
    beta_decay = LinearInterpolationHook(int(3 * args.steps / 4), args.beta,
                                         args.beta / args.beta_decay,
                                         beta_setter)

    # Training session
    try:
        print("Training starting...\n")
        with np.errstate(divide='ignore'):
            experiments.train_agent_async(
                agent=agent,
                outdir=args.out_dir,
                processes=args.processes,
                make_env=make_env,
                profile=args.profile,
                steps=int(args.steps),
                eval_n_runs=args.eval_n_runs,
                eval_interval=args.eval_interval,
                max_episode_len=args.max_episode_len,
                global_step_hooks=[pp, lr_decay, beta_decay],
                resume_step=global_t)
    except KeyboardInterrupt:
        print("\nThx for the visit. Good bye.")
Example #14
    def _test_abc(self,
                  t_max,
                  use_lstm,
                  discrete=True,
                  episodic=True,
                  steps=100000,
                  require_success=True):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(size=size,
                       discrete=discrete,
                       episodic=episodic or test,
                       partially_observable=self.use_lstm,
                       deterministic=test)

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        def phi(x):
            return x

        n_hidden_channels = 20
        if use_lstm:
            if discrete:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCSoftmaxPolicy(
                        n_hidden_channels,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
            else:
                model = a3c.A3CSharedModel(
                    shared=L.LSTM(obs_space.low.size, n_hidden_channels),
                    pi=policies.FCGaussianPolicy(
                        n_hidden_channels,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        n_hidden_channels,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
        else:
            if discrete:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCSoftmaxPolicy(
                        obs_space.low.size,
                        action_space.n,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
            else:
                model = a3c.A3CSeparateModel(
                    pi=policies.FCGaussianPolicy(
                        obs_space.low.size,
                        action_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        mean_wscale=1e-1,
                    ),
                    v=v_function.FCVFunction(
                        obs_space.low.size,
                        n_hidden_channels=n_hidden_channels,
                        n_hidden_layers=2,
                        nonlinearity=F.tanh,
                        last_wscale=1e-1,
                    ),
                )
        opt = chainer.optimizers.Adam()
        opt.setup(model)
        opt.add_hook(chainer.optimizer.GradientClipping(1))
        gamma = 0.8
        beta = 1e-2
        agent = a3c.A3C(model,
                        opt,
                        t_max=t_max,
                        gamma=gamma,
                        beta=beta,
                        phi=phi,
                        act_deterministically=True)

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(outdir=self.outdir,
                              processes=nproc,
                              make_env=make_env,
                              agent=agent,
                              steps=steps,
                              max_episode_len=max_episode_len,
                              eval_interval=500,
                              eval_n_steps=None,
                              eval_n_episodes=5,
                              successful_score=1)
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, 'successful'))
        agent.stop_episode()

        # Test
        env = make_env(0, True)
        n_test_runs = 5

        for _ in range(n_test_runs):
            total_r = 0
            obs = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(obs)
                print('state:', obs, 'action:', action)
                obs, reward, done, _ = env.step(action)
                total_r += reward
            if require_success:
                self.assertAlmostEqual(total_r, 1)
            agent.stop_episode()
Example #15
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--arch',
                        type=str,
                        default='FFSoftmax',
                        choices=('FFSoftmax', 'FFMellowmax', 'LSTMGaussian'))
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=True)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.getLogger().setLevel(args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        print('in make env', process_idx, test)
        # env = gym.make(args.env)
        env = env_vrep.Simu_env(10000 + process_idx)
        if processor_status[process_idx] == 0:
            env.connect_vrep()
            processor_status[process_idx] = 1
        return env

    # sample_env = gym.make(args.env)
    # timestep_limit = sample_env.spec.tags.get(
    #     'wrapper_config.TimeLimit.max_episode_steps')
    # obs_space = sample_env.observation_space
    # action_space = sample_env.action_space
    obs_space = env_vrep.state_size
    action_space = env_vrep.action_size
    timestep_limit = 200

    model = A3CFFSoftmax(obs_space, action_space)

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    # opt.add_hook(chainer.optimizer.GradientClipping(40))
    # if args.weight_decay > 0:
    #     opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)
    if args.load:
        agent.load(args.load)

    experiments.train_agent_async(agent=agent,
                                  outdir=args.outdir,
                                  processes=args.processes,
                                  make_env=make_env,
                                  profile=args.profile,
                                  steps=args.steps,
                                  eval_n_runs=args.eval_n_runs,
                                  eval_interval=args.eval_interval,
                                  max_episode_len=timestep_limit)
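`env_vrep` and `processor_status` come from the surrounding module and are not shown here. `processor_status` is presumably a module-level flag list, along the lines of the sketch below, so that each worker process opens its V-REP connection only once:

# Hypothetical module-level state assumed by make_env above.
processor_status = [0] * 16   # one entry per potential worker process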
Example #16
def main(args):
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    if isinstance(args, list):
        args = make_args(args)
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one process (i.e. processes > 1),
    # the results will no longer be deterministic
    # even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    n_actions = gym.make(args.env).action_space.n

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None],
                                name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph([model(fake_obs)],
                                                os.path.join(
                                                    args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    def make_env_check():
        # Use different random seeds for train and test envs
        env_seed = args.seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=True,
                                           clip_rewards=True)
        env.seed(int(env_seed))
        return env

    if args.load_agent:
        agent.load(args.load_agent)

    if (args.mode == 'train'):
        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            step_offset=args.step_offset,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
            log_type=args.log_type)
    elif (args.mode == 'check'):
        return tools.make_video.check(env=make_env_check(),
                                      agent=agent,
                                      save_mp4=args.save_mp4)

    elif (args.mode == 'growth'):
        return tools.make_video.growth(env=make_env_check(),
                                       agent=agent,
                                       outdir=args.outdir,
                                       max_num=args.max_frames,
                                       save_mp4=args.save_mp4)
Example #17
    def __init__(self, alg, env, model_path):
        self.alg = alg
        seed = 0
        n_actions = gym.make(env).action_space.n
        gpus = [-1]
        gpu = None
        misc.set_random_seed(seed, gpus=gpus)
        if alg == "DQN-C":
            model = links.Sequence(
                links.NatureDQNHead(),
                L.Linear(512, n_actions),
                DiscreteActionValue)
        if alg == "PPO":
            winit_last = chainer.initializers.LeCunNormal(1e-2)
            model = chainer.Sequential(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                L.Linear(None, 512),
                F.relu,
                links.Branched(
                    chainer.Sequential(
                        L.Linear(None, n_actions, initialW=winit_last),
                        SoftmaxDistribution,
                    ),
                    L.Linear(None, 1),
                )
            )
        if alg == "C51":
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = links.Sequence(
                links.NatureDQNHead(),
                DistributionalFCStateQFunctionWithDiscreteAction(
                    None, n_actions, n_atoms, v_min, v_max,
                    n_hidden_channels=0, n_hidden_layers=0),
            )
        if alg == "ACER":
            model = agents.acer.ACERSharedModel(
                shared=links.Sequence(
                    links.NIPSDQNHead(),
                    L.LSTM(256, 256)),
                pi=links.Sequence(
                    L.Linear(256, n_actions),
                    SoftmaxDistribution),
                q=links.Sequence(
                    L.Linear(256, n_actions),
                    DiscreteActionValue),
            )
        if alg == "A3C":
            model = A3CFF(n_actions)
        if alg == "Rainbow":
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)
            links.to_factorized_noisy(model, sigma_scale=0.5)
        if alg == "IQN":
            model = agents.iqn.ImplicitQuantileQFunction(
                psi=chainerrl.links.Sequence(
                    L.Convolution2D(None, 32, 8, stride=4),
                    F.relu,
                    L.Convolution2D(None, 64, 4, stride=2),
                    F.relu,
                    L.Convolution2D(None, 64, 3, stride=1),
                    F.relu,
                    functools.partial(F.reshape, shape=(-1, 3136)),
                ),
                phi=chainerrl.links.Sequence(
                    chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
                    F.relu,
                ),
                f=chainerrl.links.Sequence(
                    L.Linear(None, 512),
                    F.relu,
                    L.Linear(None, n_actions),
                ),
            )
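        # Dummy forward passes per architecture: they instantiate the
        # lazily-initialized links (L.Linear(None, ...) and friends) before the
        # pretrained weights are loaded below; the computational graphs built
        # here are not used any further.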
        if alg in ["A3C"]:
            fake_obs = chainer.Variable(
                np.zeros((4, 84, 84), dtype=np.float32)[None],
                name='observation')
            with chainerrl.recurrent.state_reset(model):
                # The state of the model is reset again after drawing the graph
                variables = misc.collect_variables([model(fake_obs)])
                chainer.computational_graph.build_computational_graph(variables)
        elif alg in ["Rainbow", "DQN-C", "C51", "ACER", "PPO"]:
            variables = misc.collect_variables(
                [model(np.zeros((4, 84, 84), dtype=np.float32)[None])])
            chainer.computational_graph.build_computational_graph(variables)
        else:
            fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
            fake_taus = np.zeros(32, dtype=np.float32)[None]
            variables = misc.collect_variables([model(fake_obs)(fake_taus)])

        def phi(x):
            # Feature extractor
            return np.asarray(x, dtype=np.float32) / 255

        opt = optimizers.RMSpropGraves()
        opt.setup(model)
        rbuf = replay_buffer.ReplayBuffer(1)
        if alg == "IQN":
            self.agent = agents.IQN(model, opt, rbuf, gpu=gpu, gamma=0.99, act_deterministically=True, explorer=None,
                                    replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True,
                                    update_interval=4, phi=phi)
        if alg == "A3C":
            self.agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, phi=phi, act_deterministically=True)
        if alg == "Rainbow":
            self.agent = agents.CategoricalDoubleDQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
                                                     replay_start_size=1, minibatch_size=1, target_update_interval=None,
                                                     clip_delta=True, update_interval=4, phi=phi)
        if alg == "DQN-C":
            self.agent = agents.DQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None, replay_start_size=1,
                                    minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4,
                                    phi=phi)
        if alg == "C51":
            self.agent = agents.CategoricalDQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99,
                explorer=None, replay_start_size=1,
                minibatch_size=1,
                target_update_interval=None,
                clip_delta=True,
                update_interval=4,
                phi=phi,
            )
        if alg == "ACER":
            self.agent = agents.acer.ACER(model, opt, t_max=5, gamma=0.99,
                                          replay_buffer=rbuf,
                                          n_times_replay=4,
                                          replay_start_size=1,
                                          act_deterministically=True,
                                          phi=phi
                                          )
        if alg == "PPO":
            self.agent = agents.PPO(model, opt, gpu=gpu, phi=phi, update_interval=4, minibatch_size=1, clip_eps=0.1,
                                    recurrent=False, act_deterministically=True)
        self.agent.load(os.path.join(model_path, 'chainer', alg, env.replace("NoFrameskip-v4", ""), 'final'))
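Only the constructor of this loader class is shown above. As a minimal usage sketch, and assuming the surrounding imports are available, the loaded agent could be rolled out for one episode roughly as below; the class name PretrainedModelLoader and the ./models directory are assumptions, not part of the original.

from chainerrl.wrappers import atari_wrappers

env = atari_wrappers.wrap_deepmind(
    atari_wrappers.make_atari('BreakoutNoFrameskip-v4'),
    episode_life=False,
    clip_rewards=False)

# Hypothetical class name wrapping the constructor above.
loader = PretrainedModelLoader('A3C', 'BreakoutNoFrameskip-v4', './models')
obs, done, episode_return = env.reset(), False, 0.0
while not done:
    # Greedy action, since the A3C agent was built with act_deterministically=True.
    action = loader.agent.act(obs)
    obs, reward, done, _ = env.step(action)
    episode_return += reward
loader.agent.stop_episode()
print('episode return:', episode_return)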
Example No. 18
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--outdir', type=str, default='a3c_training',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='TTT-A3C-v0')
    parser.add_argument('--seed', type=int, default=17, help='Random seed [0, 2 ** 32)')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=5*10 ** 5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--arch', type=str, default='FFSoftmax', choices=('FFSoftmax',))
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e0)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-1)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=1*1e-4)
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--logger-level', type=int, default=logging.ERROR)
    parser.add_argument('--monitor', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be deterministic even with the same random seed.
    misc.set_random_seed(args.seed)
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)

        # NOTE: uncomment the next line to start from a pretrained agent
        # env.set_agent(gym_ttt.pretrained_agent.get_pretrained_agent("./"))
        return env

    sample_env = gym.make(args.env)

    # Timestep limit: the number of steps after which an episode is cut off,
    # whether or not the game itself has ended
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')

    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    # Initialize the NN and the optimizer
    model = A3CFFSoftmax(obs_space.low.size, action_space.n)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    agent = a3c.A3C(model, opt, t_max=args.t_max, gamma=0.8, beta=args.beta)
    if args.load:
        agent.load(args.load)

    # Draw the computational graphs of the policy head and the value head
    fake_obs = np.zeros((1, 3, 3), dtype=np.float32)  # one empty 3 x 3 board
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(fake_obs)[0]],
        os.path.join(args.outdir, 'model_pi'))
    chainerrl.misc.draw_computational_graph(
        [agent.model.pi_and_v(fake_obs)[1]],
        os.path.join(args.outdir, 'model_v'))

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env             = env,
            agent           = agent,
            n_steps         = None,
            n_episodes      = args.eval_n_runs,
            max_episode_len = timestep_limit)

        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))

    else:
        experiments.train_agent_async(
            agent           = agent,
            outdir          = args.outdir,
            processes       = args.processes,
            make_env        = make_env,
            profile         = args.profile,
            steps           = args.steps,
            eval_n_runs     = args.eval_n_runs,
            eval_interval   = args.eval_interval,
            max_episode_len = timestep_limit)
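
After training, train_agent_async writes its evaluation scores and agent snapshots under args.outdir; a saved agent directory can later be handed back via --load, and --demo then evaluates it without further training.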
Example No. 19
            ndim_obs, n_actions, hidden_sizes, nonlinearity=F.tanh))
        self.v = links.MLP(ndim_obs,
                           1,
                           hidden_sizes=hidden_sizes,
                           nonlinearity=F.tanh)
        super().__init__(self.pi, self.v)

    def pi_and_v(self, state):
        return self.pi(state), self.v(state)


# load trained A3C agent
model = A3CFFSoftmax(ndim_obs=9, n_actions=9)
opt = rmsprop_async.RMSpropAsync()
opt.setup(model)
agent = a3c.A3C(model, opt, t_max=5, gamma=0.99)
agent.load(agent_path)


def check_game(field, action):
    global player_symbol, CPU_symbol, state

    # action is linearized to 0 to 8: break down into 3 x 3 array
    row, col = divmod(int(action), 3)

    if field["text"] == " ":  # user performed legal move
        field["text"] = player_symbol
        field["state"] = "disabled"
        state[row][col] = -1

    # player won
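
The function above is cut off at the win check. As a minimal sketch of how the loaded agent could choose the CPU's reply from the same 3 x 3 state array, one might write a helper like the one below; cpu_move is a hypothetical name, not part of the original.

import numpy as np


def cpu_move(agent, state):
    # The 3 x 3 board is passed as float32; Chainer's Linear layers flatten it
    # into the 9-dimensional observation the MLP expects.
    obs = np.asarray(state, dtype=np.float32)
    # act() samples from the policy here; construct the agent with
    # act_deterministically=True if greedy play is preferred.
    action = agent.act(obs)
    agent.stop_episode()           # one-off query, no episode state to keep
    return divmod(int(action), 3)  # (row, col) on the board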
Example No. 20
def main():

    # Prevent numpy from using multiple threads
    os.environ['OMP_NUM_THREADS'] = '1'

    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('rom', type=str)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--use-sdl', action='store_true')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=10**6)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--use-lstm', action='store_true')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.set_defaults(use_sdl=False)
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    print('Output files are saved in {}'.format(args.outdir))

    n_actions = ale.ALE(args.rom).number_of_actions

    if args.use_lstm:
        model = A3CLSTM(n_actions)
    else:
        model = A3CFF(n_actions)
    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))
    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=dqn_phi)
    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        env = ale.ALE(args.rom,
                      use_sdl=args.use_sdl,
                      treat_life_lost_as_terminal=not test)
        if not test:
            misc.env_modifiers.make_reward_clipped(env, -1, 1)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value
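        # lr_setter is handed to the LinearInterpolationHook below, which
        # calls it during training with a value decayed linearly from args.lr
        # to 0 over args.steps.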

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=args.max_episode_len,
                                      global_step_hooks=[lr_decay_hook])
Example No. 21
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-n-runs', type=int, default=10,
                        help='Number of episodes used for --demo evaluation.')
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(np.zeros((4, 84, 84), dtype=np.float32)[None],
                                name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph([model(fake_obs)],
                                                os.path.join(
                                                    args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model,
                    opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi)

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev: {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
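
Note that this variant evaluates for a fixed number of environment steps rather than a fixed number of episodes, which is why eval_n_steps=args.eval_n_steps and eval_n_episodes=None are passed to train_agent_async; --eval-n-runs is only used by the --demo branch above.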
Example No. 22
                                       clip_rewards=False)
    env.seed(seed)
    return env


seed = 0
env_name = 'BreakoutNoFrameskip-v4'

misc.set_random_seed(seed)

env = make_env()
n_actions = env.action_space.n

model = A3CFF(n_actions)
opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(40))

agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, beta=1e-2, phi=phi)

agent.load('parameters')

ACTION_MEANINGS = {
    0: 'NOOP',
    1: 'FIRE',
    2: 'RIGHT',
    3: 'LEFT',
}

launch_visualizer(agent, env, ACTION_MEANINGS, raw_image_input=True)
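
launch_visualizer presumably comes from the chainerrl-visualizer package; it starts a local web UI in which the loaded agent can be stepped through the environment, with ACTION_MEANINGS labelling the discrete actions and raw_image_input=True indicating that the observations are raw image frames.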