Example #1
0
def create_state_action_q_function_for_env(env):
    """Build a fully connected state-action Q-function sized for ``env``.

    Both the observation space and the action space of ``env`` must be
    ``gym.spaces.Box`` instances. The input sizes are taken from the
    number of elements in each space's ``low`` array, and the network
    uses two hidden layers of 200 channels each.

    Args:
        env: Environment exposing ``observation_space`` and
            ``action_space`` attributes.

    Returns:
        A ``q_functions.FCSAQFunction`` instance.
    """
    obs_space = env.observation_space
    assert isinstance(obs_space, gym.spaces.Box)
    act_space = env.action_space
    assert isinstance(act_space, gym.spaces.Box)

    return q_functions.FCSAQFunction(
        n_dim_obs=obs_space.low.size,
        n_dim_action=act_space.low.size,
        n_hidden_channels=200,
        n_hidden_layers=2,
    )
Example #2
0
def make_acer_agent(obs_space_dim, action_space_dim):
    """Create an ACER agent with Gaussian policy for box spaces in [-1, 1].

    Args:
        obs_space_dim: Number of observation dimensions.
        action_space_dim: Number of action dimensions.

    Returns:
        A ``chainerrl.agents.acer.ACER`` agent backed by an
        ``ACERSDNSeparateModel`` and an episodic replay buffer of
        capacity 10.
    """
    def cast_to_float32(obs):
        # copy=False avoids a copy when the observation is already float32.
        return obs.astype(np.float32, copy=False)

    # Boxes with bounds -1 / +1 in every dimension; they supply the
    # sizes and action bounds used to build the networks below.
    obs_space = gym.spaces.Box(np.array([-1] * obs_space_dim),
                               np.array([1] * obs_space_dim))
    action_space = gym.spaces.Box(np.array([-1] * action_space_dim),
                                  np.array([1] * action_space_dim))

    n_obs = obs_space.low.size
    n_act = action_space.low.size
    hidden = 50

    # Networks are created in the order pi, v, adv.
    policy_net = policies.FCGaussianPolicy(
        n_obs,
        n_act,
        n_hidden_channels=hidden,
        n_hidden_layers=2,
        bound_mean=True,
        min_action=action_space.low,
        max_action=action_space.high,
    )
    value_net = v_functions.FCVFunction(
        n_obs,
        n_hidden_channels=hidden,
        n_hidden_layers=2,
    )
    # The advantage network uses a quarter of the hidden width.
    advantage_net = q_functions.FCSAQFunction(
        n_obs,
        n_act,
        n_hidden_channels=hidden // 4,
        n_hidden_layers=2,
    )
    model = chainerrl.agents.acer.ACERSDNSeparateModel(
        pi=policy_net,
        v=value_net,
        adv=advantage_net,
    )

    optimizer = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(40))

    return chainerrl.agents.acer.ACER(
        model,
        optimizer,
        t_max=5,
        gamma=1,
        replay_buffer=EpisodicReplayBuffer(10),
        n_times_replay=1,
        replay_start_size=50,
        disable_online_update=False,
        use_trust_region=True,
        trust_region_delta=0.1,
        truncation_threshold=5,
        beta=0.5,
        phi=cast_to_float32,
    )
Example #3
0
# Set a random seed used in ChainerRL
# (`seed` is defined outside this fragment.)
misc.set_random_seed(seed)

# Environment Initialization
# `make_env` is defined outside this fragment; it is called here with
# test=False and render=False.

env = make_env(test=False, render=False)
# The episode step-limit lookup below is currently disabled.
#timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
# Input size of the networks: product of the observation-space shape.
obs_size = np.asarray(env.observation_space.shape).prod()
action_space = env.action_space

# Output size of the policy: product of the action-space shape.
action_size = np.asarray(action_space.shape).prod()

# Critic Network
# Takes obs_size and action_size; width and depth come from the
# critic_hidden_units / critic_hidden_layers settings defined elsewhere.

q_func = q_functions.FCSAQFunction(obs_size,
                                   action_size,
                                   n_hidden_channels=critic_hidden_units,
                                   n_hidden_layers=critic_hidden_layers)

# Actor Network
# Deterministic policy; it is given the action-space bounds together with
# bound_action=True.
pi = policy.FCDeterministicPolicy(obs_size,
                                  action_size=action_size,
                                  n_hidden_channels=actor_hidden_units,
                                  n_hidden_layers=actor_hidden_layers,
                                  min_action=action_space.low,
                                  max_action=action_space.high,
                                  bound_action=True)

# The Model
# Bundles the critic and the actor; one Adam optimizer is created per
# network with its own alpha (actor_lr / critic_lr). The optimizers are not
# attached to the networks within this fragment.

model = DDPGModel(q_func=q_func, policy=pi)
opt_actor = optimizers.Adam(alpha=actor_lr)
opt_critic = optimizers.Adam(alpha=critic_lr)
Example #4
0
def main():
    """Train or evaluate a DDPG agent on a Gym environment.

    Parses command-line options, seeds ChainerRL, builds separate training
    and evaluation environments, constructs the actor/critic networks
    (optionally with batch normalization via ``--use-bn``), and creates a
    DDPG agent with additive Ornstein-Uhlenbeck exploration noise.

    With ``--demo`` the agent is only evaluated for ``--eval-n-runs``
    episodes and the statistics are printed; otherwise it is trained for
    ``--steps`` steps with periodic evaluation.

    Note: ``--final-exploration-steps`` is accepted for command-line
    compatibility but is not used anywhere in this function.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Humanoid-v2')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-interval', type=int, default=1)
    parser.add_argument('--target-update-method',
                        type=str,
                        default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    def clip_action_filter(a):
        # `action_space` is bound further down, after the training env has
        # been created; this filter only runs once the env is stepped.
        return np.clip(a, action_space.low, action_space.high)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True,
            normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    # Actor and critic are optimized separately, each with gradient clipping.
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10**5)

    # Exploration noise scale: 20% of the action range in each dimension.
    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    agent = DDPG(model,
                 opt_a,
                 opt_c,
                 rbuf,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size)

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
Example #5
0
def main():
    """Train or evaluate a DDPG agent on a Gym environment.

    Parses command-line options, optionally seeds ChainerRL, builds a
    single environment with action clipping and reward scaling, constructs
    the actor/critic networks (optionally with batch normalization via
    ``--use-bn``), and creates a DDPG agent with additive
    Ornstein-Uhlenbeck exploration noise.

    With ``--demo`` the agent is only evaluated for ``--eval-n-runs`` runs
    and the mean, median and standard deviation are printed; otherwise it
    is trained for ``--steps`` steps with periodic evaluation.

    Note: ``--final-exploration-steps`` is accepted for command-line
    compatibility but is not used anywhere in this function.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='out')
    parser.add_argument('--env', type=str, default='Humanoid-v1')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-frequency', type=int, default=1)
    parser.add_argument('--target-update-method',
                        type=str,
                        default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-frequency', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Seeding is optional here: without --seed the run is not seeded.
    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        # `action_space` is bound further down, after the env has been
        # created; this filter only runs once the env is stepped.
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env():
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
        if args.render:
            misc.env_modifiers.make_rendered(env)

        def __exit__(self, *exc_info):
            pass

        # NOTE(review): this sets __exit__ on the instance, not the class;
        # confirm whether any caller still relies on it.
        env.__exit__ = __exit__
        return env

    env = make_env()
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True,
            normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    # Actor and critic are optimized separately, each with gradient clipping.
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10**5)

    def phi(obs):
        # The agent is given float32 observations.
        return obs.astype(np.float32)

    # Exploration noise scale: 20% of the action range in each dimension.
    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    agent = DDPG(model,
                 opt_a,
                 opt_c,
                 rbuf,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_frequency=args.target_update_frequency,
                 update_frequency=args.update_frequency,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 phi=phi,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size)
    agent.logger.setLevel(logging.DEBUG)

    if args.load:
        agent.load(args.load)

    if args.demo:
        mean, median, stdev = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        # Four placeholders for four values; the stdev placeholder was
        # previously missing, so stdev was silently dropped from the output.
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_frequency=args.eval_frequency,
            outdir=args.outdir,
            max_episode_len=timestep_limit)
Example #6
0
def main():
    """Run asynchronous ACER training, or a demo evaluation, on a Gym env.

    Parses command-line options (the positional ``processes`` argument is
    the number of worker processes), seeds ChainerRL, derives one seed per
    process, and builds an ACER model: a Gaussian-policy model when the
    action space is a ``spaces.Box``, otherwise a softmax-policy model
    with a discrete action-value head.

    With ``--demo`` the agent is evaluated for ``--eval-n-runs`` episodes
    and the statistics are printed; otherwise it is trained with
    ``experiments.train_agent_async``.
    """
    import logging

    arg_parser = argparse.ArgumentParser()
    add = arg_parser.add_argument
    add('processes', type=int)
    add('--env', type=str, default='CartPole-v0')
    add('--seed', type=int, default=0, help='Random seed [0, 2 ** 32)')
    add('--outdir', type=str, default='results',
        help='Directory path to save output files.'
        ' If it does not exist, it will be created.')
    add('--t-max', type=int, default=50)
    add('--n-times-replay', type=int, default=4)
    add('--n-hidden-channels', type=int, default=100)
    add('--n-hidden-layers', type=int, default=2)
    add('--replay-capacity', type=int, default=5000)
    add('--replay-start-size', type=int, default=10**3)
    add('--disable-online-update', action='store_true')
    add('--beta', type=float, default=1e-2)
    add('--profile', action='store_true')
    add('--steps', type=int, default=8 * 10**7)
    add('--eval-interval', type=int, default=10**5)
    add('--eval-n-runs', type=int, default=10)
    add('--reward-scale-factor', type=float, default=1e-2)
    add('--rmsprop-epsilon', type=float, default=1e-2)
    add('--render', action='store_true', default=False)
    add('--lr', type=float, default=7e-4)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default='')
    add('--logger-level', type=int, default=logging.DEBUG)
    add('--monitor', action='store_true')
    add('--truncation-threshold', type=float, default=5)
    add('--trust-region-delta', type=float, default=0.1)
    args = arg_parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Seed ChainerRL. With more than one process the results are no
    # longer deterministic, even for a fixed seed.
    misc.set_random_seed(args.seed)

    # Each subprocess gets its own seed:
    #   seed=0, processes=4 -> [0, 1, 2, 3]
    #   seed=1, processes=4 -> [4, 5, 6, 7]
    process_seeds = args.seed * args.processes + np.arange(args.processes)
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Train and test envs use different seeds.
        process_seed = int(process_seeds[process_idx])
        if test:
            env_seed = 2**32 - 1 - process_seed
        else:
            env_seed = process_seed
        env.seed(env_seed)
        # The model works on float32 observations.
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        is_first_process = process_idx == 0
        if args.monitor and is_first_process:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Rewards (and thus returns) are scaled to a reasonable range
            # to make training easier.
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and is_first_process and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    # A throwaway env used only to read the spaces and the step limit.
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    n_obs = obs_space.low.size
    hidden = args.n_hidden_channels
    layers = args.n_hidden_layers

    if not isinstance(action_space, spaces.Box):
        # Non-Box action space: softmax policy plus discrete Q head.
        policy_net = links.Sequence(
            L.Linear(n_obs, hidden),
            F.relu,
            L.Linear(hidden, action_space.n, initialW=LeCunNormal(1e-3)),
            SoftmaxDistribution)
        q_net = links.Sequence(
            L.Linear(n_obs, hidden),
            F.relu,
            L.Linear(hidden, action_space.n, initialW=LeCunNormal(1e-3)),
            DiscreteActionValue)
        model = acer.ACERSeparateModel(pi=policy_net, q=q_net)
    else:
        # Box action space: Gaussian policy, state value function and an
        # advantage network with a quarter of the hidden width.
        n_act = action_space.low.size
        policy_net = policies.FCGaussianPolicy(
            n_obs,
            n_act,
            n_hidden_channels=hidden,
            n_hidden_layers=layers,
            bound_mean=True,
            min_action=action_space.low,
            max_action=action_space.high)
        value_net = v_functions.FCVFunction(
            n_obs,
            n_hidden_channels=hidden,
            n_hidden_layers=layers)
        advantage_net = q_functions.FCSAQFunction(
            n_obs,
            n_act,
            n_hidden_channels=hidden // 4,
            n_hidden_layers=layers)
        model = acer.ACERSDNSeparateModel(
            pi=policy_net, v=value_net, adv=advantage_net)

    optimizer = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(40))

    episodic_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(
        model,
        optimizer,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=episodic_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        disable_online_update=args.disable_online_update,
        use_trust_region=True,
        trust_region_delta=args.trust_region_delta,
        truncation_threshold=args.truncation_threshold,
        beta=args.beta)
    if args.load:
        agent.load(args.load)

    if not args.demo:
        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            max_episode_len=timestep_limit)
        return

    # Demo mode: evaluate only, on a test env for process 0.
    demo_env = make_env(0, True)
    eval_stats = experiments.eval_performance(
        env=demo_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=timestep_limit)
    print('n_runs: {} mean: {} median: {} stdev {}'.format(
        args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
        eval_stats['stdev']))
Example #7
0
import chainer
from chainer import optimizers
import numpy as np

# Run settings; `parser` is defined outside this fragment.
args = parser()

# Prosthetics environment with visualization enabled, switched to the 3D
# prosthetic model at difficulty 0 with a fixed seed.
env = ProstheticsEnv(visualize=True)
env.change_model(model='3D', prosthetic=True, difficulty=0, seed=2**32 - 1)
# Converts an observation to a float32 NumPy array (no copy if it already is).
phi = lambda x: np.array(x).astype(np.float32, copy=False)
# NOTE(review): obs_size is computed here but not used below; both networks
# are built with a hard-coded input size of 160 instead. Confirm which is
# intended.
obs_size = np.asarray(env.observation_space.shape).prod()
action_space = env.action_space
action_size = np.asarray(action_space.shape).prod()

# Critic Network
# NOTE(review): n_hidden_layers is given args.hidden_size, the same value as
# n_hidden_channels. Confirm that the layer count is meant to equal the
# layer width.
q_func = q_functions.FCSAQFunction(160,
                                   action_size,
                                   n_hidden_channels=args.hidden_size,
                                   n_hidden_layers=args.hidden_size)

# Policy Network
# Deterministic policy given the action-space bounds with bound_action=True.
# NOTE(review): same n_hidden_layers=args.hidden_size question as the critic.
pi = policy.FCDeterministicPolicy(160,
                                  action_size=action_size,
                                  n_hidden_channels=args.hidden_size,
                                  n_hidden_layers=args.hidden_size,
                                  min_action=action_space.low,
                                  max_action=action_space.high,
                                  bound_action=True)

# Model
# Bundles critic and policy; one Adam optimizer is created per network. The
# optimizers are not attached to the networks within this fragment.
model = DDPGModel(q_func=q_func, policy=pi)
opt_actor = optimizers.Adam(alpha=args.actor_lr)
opt_critic = optimizers.Adam(alpha=args.critic_lr)
Example #8
0
# Run settings; `parser` is defined outside this fragment.
args = parser()

# Prosthetics environment without visualization.
env = ProstheticsEnv(visualize=False)
env.change_model(model='3D', prosthetic=True, difficulty=0,
                 seed=None)  # Start with difficulty 0, a prosthetic and the 3D model
# The action space for the chosen model is a vector of length 19
observation = env.reset(project=True)

# Environment sizes
# The observation size comes from args.obs_size; the computation from the
# environment's observation space below is disabled.
#np.asarray(env.observation_space.shape).prod()
print(args.obs_size)
action_size = np.asarray(env.action_space.shape).prod()  # 19

# Q-function
# NOTE(review): n_hidden_channels is given args.hidd_lay and n_hidden_layers
# is given args.c_hidd_lay; confirm the argument names match their meaning.
q_func = q_functions.FCSAQFunction(args.obs_size,
                                   action_size,
                                   n_hidden_channels=args.hidd_lay,
                                   n_hidden_layers=args.c_hidd_lay)
# Move the Q-function to GPU device 0.
# NOTE(review): the policy below is not moved to the GPU in this fragment;
# confirm that happens elsewhere.
q_func.to_gpu(0)

# Policy
# Deterministic policy given the action-space bounds with bound_action=True.
pi = policy.FCDeterministicPolicy(args.obs_size,
                                  action_size=action_size,
                                  n_hidden_channels=args.hidd_lay,
                                  n_hidden_layers=args.c_hidd_lay,
                                  min_action=env.action_space.low,
                                  max_action=env.action_space.high,
                                  bound_action=True)

print(env.action_space)

# The model
Example #9
0
def main():
    """Train or evaluate a DDPG agent with a hindsight replay buffer.

    Parses command-line options, builds goal-conditioned actor and critic
    networks (their input is the observation concatenated with the desired
    goal) for the gym environment named by ``--env``, and then either
    evaluates the agent (``--demo``) or runs batch training with periodic
    evaluation.
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='FetchPickAndPlace-v1')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10 ** 6)
    parser.add_argument('--actor-lr', type=float, default=1e-3)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=200 * 50 * 16 * 50)
    parser.add_argument('--n-hidden-channels', type=int, default=64)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-update-times', type=int, default=40)
    parser.add_argument('--target-update-interval',
                        type=int, default=16 * 50)
    parser.add_argument('--target-update-method',
                        type=str, default='soft', choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1 - 0.95)
    parser.add_argument('--update-interval', type=int, default=16 * 50)
    parser.add_argument('--eval-n-runs', type=int, default=30)
    parser.add_argument('--eval-interval', type=int, default=50 * 16 * 50)
    parser.add_argument('--gamma', type=float, default=0.98)
    parser.add_argument('--minibatch-size', type=int, default=128)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--epsilon', type=float, default=0.05)
    parser.add_argument('--noise-std', type=float, default=0.05)
    parser.add_argument('--clip-threshold', type=float, default=5.0)
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def clip_action_filter(a):
        # action_space is assigned further down in main(); the filter is only
        # invoked when an environment steps, which happens after that.
        return np.clip(a, action_space.low, action_space.high)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(idx, test):
        """Build the idx-th environment; test envs get a mirrored seed."""
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        if test:
            env = HEREnvWrapper(env, args.outdir)
        return env

    def make_batch_env(test):
        """Build a vectorized env with one sub-environment per index."""
        # Bind idx as a default argument. A plain closure looks idx up only
        # when the factory is called, i.e. after the comprehension finished,
        # so every factory would build the env (and seed) of the last index.
        return chainerrl.envs.MultiprocessVectorEnv(
            [(lambda idx=idx: make_env(idx, test))
             for idx in range(args.num_envs)])

    sample_env = make_env(0, test=False)

    def reward_function(state, action, goal):
        # Recompute the reward of a stored transition against another goal;
        # handed to the hindsight replay buffer below.
        return sample_env.compute_reward(
            achieved_goal=state['achieved_goal'],
            desired_goal=goal,
            info=None)

    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    space_dict = sample_env.observation_space.spaces
    observation_space = space_dict['observation']
    goal_space = space_dict['desired_goal']
    obs_size = np.asarray(observation_space.shape).prod()
    goal_size = np.asarray(goal_space.shape).prod()
    action_space = sample_env.action_space

    action_size = np.asarray(action_space.shape).prod()
    # Both networks take the observation concatenated with the goal.
    q_func = q_functions.FCSAQFunction(
        obs_size + goal_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = policy.FCDeterministicPolicy(
        obs_size + goal_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.HindsightReplayBuffer(
        reward_function,
        10 ** 6,
        future_k=4)

    def phi(dict_state):
        # Flatten the dict observation into one float32 vector:
        # [observation, desired_goal].
        return np.concatenate(
            (dict_state['observation'].astype(np.float32, copy=False),
             dict_state['desired_goal'].astype(np.float32, copy=False)), 0)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_size + goal_size, clip_threshold=args.clip_threshold)

    explorer = HERExplorer(
        args.noise_std,
        args.epsilon,
        action_space)
    agent = DDPG(model, opt_a, opt_c, rbuf,
                 obs_normalizer=obs_normalizer,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 phi=phi,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size,
                 clip_critic_tgt=(-1.0/(1.0-args.gamma), 0.0))

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent, env=make_batch_env(test=False), steps=args.steps,
            eval_env=make_batch_env(test=True), eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir,
            max_episode_len=timestep_limit)
Example #10
0
# Show the remaining results of an earlier env step (done and info are
# defined outside this snippet).
print('done:', done)
print('info:', info)

print('actions:', str(env.action_space))

# Episode length limit read from the env spec tags; .get() yields None when
# the tag is absent.
timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps'
	)
obs_size = np.asarray(env.observation_space.shape).prod()
action_space = env.action_space
# NOTE(review): the action size is taken from action_space.n, while the policy
# below reads action_space.low / action_space.high. In gym, `n` presumably
# belongs to Discrete spaces and `low`/`high` to Box spaces -- verify that the
# space used here really provides both.
action_size = np.asarray(action_space.n).prod()

# Set the Q function up
# (n_hidden_channels and n_hidden_layers are defined outside this snippet.)
q_func = q_functions.FCSAQFunction(
		obs_size, 
		action_size,
		n_hidden_channels=n_hidden_channels,
		n_hidden_layers=n_hidden_layers
	)
	
# Set the policy up
# Deterministic policy whose outputs are bounded by the action-space limits.
pi = policy.FCDeterministicPolicy(
		obs_size, 
		action_size=action_size,
		n_hidden_channels=n_hidden_channels,
		n_hidden_layers=n_hidden_layers,
		min_action=action_space.low, 
		max_action=action_space.high,
		bound_action=True
	)
	
# Set up the optimizers
Example #11
0
def main():
    """Train or evaluate an ACER agent on the gym environment given by --env.

    The model is chosen from the action space of a sample environment: a
    Gaussian policy with separate value and advantage networks for Box
    spaces, otherwise softmax-policy and Q-value heads. With ``--demo`` the
    agent is only evaluated; otherwise it is trained asynchronously across
    ``processes`` workers.
    """
    import logging

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('processes', type=int)
    add('--env', type=str, default='CartPole-v0')
    add('--seed', type=int, default=None)
    add('--outdir', type=str, default=None)
    add('--t-max', type=int, default=50)
    add('--n-times-replay', type=int, default=4)
    add('--n-hidden-channels', type=int, default=100)
    add('--n-hidden-layers', type=int, default=2)
    add('--replay-capacity', type=int, default=5000)
    add('--replay-start-size', type=int, default=10**3)
    add('--disable-online-update', action='store_true')
    add('--beta', type=float, default=1e-2)
    add('--profile', action='store_true')
    add('--steps', type=int, default=8 * 10**7)
    add('--eval-interval', type=int, default=10**5)
    add('--eval-n-runs', type=int, default=10)
    add('--reward-scale-factor', type=float, default=1e-2)
    add('--rmsprop-epsilon', type=float, default=1e-2)
    add('--render', action='store_true', default=False)
    add('--lr', type=float, default=7e-4)
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default='')
    add('--logger-level', type=int, default=logging.DEBUG)
    add('--monitor', action='store_true')
    add('--truncation-threshold', type=float, default=5)
    add('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def scale_reward(reward):
        # Rewards seen by the agent during training are scaled by this factor.
        return reward * args.reward_scale_factor

    def make_env(process_idx, test):
        """Create one environment; only process 0 is monitored/rendered."""
        env = gym.make(args.env)
        is_first_process = process_idx == 0
        if args.monitor and is_first_process:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(env, scale_reward)
        if args.render and is_first_process and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    # A throwaway environment used only to read the spaces and step limit.
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    obs_dim = obs_space.low.size
    n_channels = args.n_hidden_channels
    n_layers = args.n_hidden_layers

    if isinstance(action_space, spaces.Box):
        # Continuous actions: policy, state value and advantage networks.
        action_dim = action_space.low.size
        gaussian_policy = policies.FCGaussianPolicy(
            obs_dim,
            action_dim,
            n_hidden_channels=n_channels,
            n_hidden_layers=n_layers,
            bound_mean=True,
            min_action=action_space.low,
            max_action=action_space.high)
        state_value = v_functions.FCVFunction(
            obs_dim,
            n_hidden_channels=n_channels,
            n_hidden_layers=n_layers)
        advantage = q_functions.FCSAQFunction(
            obs_dim,
            action_dim,
            n_hidden_channels=n_channels // 4,
            n_hidden_layers=n_layers)
        model = acer.ACERSDNSeparateModel(
            pi=gaussian_policy, v=state_value, adv=advantage)
    else:
        def discrete_head(output_wrapper):
            # Linear -> ReLU -> Linear network whose output, one unit per
            # action, is passed through output_wrapper.
            return links.Sequence(
                L.Linear(obs_dim, n_channels), F.relu,
                L.Linear(n_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)),
                output_wrapper)

        model = acer.ACERSeparateModel(
            pi=discrete_head(SoftmaxDistribution),
            q=discrete_head(DiscreteActionValue))

    opt = rmsprop_async.RMSpropAsync(
        lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=replay_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        disable_online_update=args.disable_online_update,
        use_trust_region=True,
        trust_region_delta=args.trust_region_delta,
        truncation_threshold=args.truncation_threshold,
        beta=args.beta,
        phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        # Evaluation only: run the agent on a test environment and report.
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
        return

    experiments.train_agent_async(
        agent=agent,
        outdir=args.outdir,
        processes=args.processes,
        make_env=make_env,
        profile=args.profile,
        steps=args.steps,
        eval_n_runs=args.eval_n_runs,
        eval_interval=args.eval_interval,
        max_episode_len=timestep_limit)
Example #12
0
    cur_vel = (cur_vel_x**2 + cur_vel_z**2)**0.5
    return cur_vel


# Set a random seed used in ChainerRL
misc.set_random_seed(seed)

# Setup the environment
env = make_env(test=False, render=False)
# NOTE(review): obs_size is computed here, but the critic below is built with
# a hard-coded input size of 160 -- confirm the two agree.
obs_size = np.asarray(env.observation_space.shape).prod()
action_space = env.action_space
action_size = np.asarray(action_space.shape).prod()

# Critic Network
q_func = q_functions.FCSAQFunction(160,
                                   action_size,
                                   n_hidden_channels=args.critic_hidden_units,
                                   n_hidden_layers=args.critic_hidden_layers)

# Disabled alternative critic (FCLSTMSAQFunction) with the same arguments:
# q_func = q_functions.FCLSTMSAQFunction(
#             160,
#             action_size,
#             n_hidden_channels=args.critic_hidden_units,
#             n_hidden_layers=args.critic_hidden_layers)

#q_func.to_gpu(0)  # Move to the GPU

# Policy Network
# pi = policy.FCDeterministicPolicy(
#             160,
#             action_size=action_size,
#             n_hidden_channels=args.actor_hidden_units,
def main(args):
    """Train an ACER agent, or roll it out and save the frames as a video.

    Args:
        args: Options object exposing the attributes read below (``env``,
            ``mode``, ``outdir``, ...), or a list, which is first converted
            with ``make_args``.

    Returns:
        In ``'check'`` mode, the matplotlib animation built from the rendered
        frames (it is also saved to ``args.save_mp4``). The ``'train'``
        branch returns nothing.
    """
    import logging
    logging.basicConfig(level=logging.INFO, filename='log')

    # isinstance (instead of an exact type comparison) also accepts list
    # subclasses.
    if isinstance(args, list):
        args = make_args(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        """Create the environment for one worker process."""
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    # A throwaway environment used only to read the spaces and step limit.
    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        # Continuous actions: Gaussian policy plus value/advantage networks.
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        # Otherwise: softmax policy head and Q-value head, one output per
        # action (action_space.n).
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)

    if args.load_agent:
        agent.load(args.load_agent)

    if args.mode == 'train':
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      step_offset=args.step_offset,
                                      checkpoint_freq=args.checkpoint_freq,
                                      log_type=args.log_type,
                                      eval_n_steps=None,
                                      eval_n_episodes=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)

    elif args.mode == 'check':
        from matplotlib import animation
        import matplotlib.pyplot as plt
        env = make_env(0, True)

        # Roll out three episodes of at most 200 steps each, recording every
        # rendered frame. The environment is closed even if a rollout fails.
        frames = []
        try:
            for i in range(3):
                obs = env.reset()
                done = False
                R = 0
                t = 0
                while not done and t < 200:
                    frames.append(env.render(mode='rgb_array'))
                    action = agent.act(obs)
                    obs, r, done, _ = env.step(action)
                    R += r
                    t += 1
                print('test episode:', i, 'R:', R)
                agent.stop_episode()
        finally:
            env.close()

        # Size the figure so one frame pixel maps to one figure pixel at
        # 72 dpi.
        plt.figure(figsize=(frames[0].shape[1] / 72.0,
                            frames[0].shape[0] / 72.0),
                   dpi=72)
        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(),
                                       animate,
                                       frames=len(frames),
                                       interval=50)
        anim.save(args.save_mp4)
        return anim