Example #1
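# The excerpts below omit their import headers. A minimal sketch of the
# imports they assume (not part of the original snippets; download_model in
# Examples #3 and #4 comes from ChainerRL's pretrained-model utilities):
import argparse
import logging
import os
import sys

import gym
import numpy as np

import chainer
from chainer import functions as F
from chainer import links as L
from chainer import optimizers

import chainerrl
from chainerrl import agents, experiments, explorers, misc, replay_buffer
from tqdm import tqdm
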
    def create_agent(self, env):
        model = agents.ddpg.DDPGModel(
            policy=create_stochastic_policy_for_env(env),
            q_func=create_state_action_q_function_for_env(env))
        rbuf = replay_buffer.ReplayBuffer(10 ** 5)
        opt_a = optimizers.Adam()
        opt_a.setup(model.policy)
        opt_b = optimizers.Adam()
        opt_b.setup(model.q_function)
        explorer = explorers.AdditiveGaussian(scale=1)
        return agents.PGT(model, opt_a, opt_b, rbuf, gamma=0.99,
                          explorer=explorer)
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env',
                        type=str,
                        default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs',
                        type=int,
                        default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=10000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=100,
                        help='Minibatch size')
    parser.add_argument('--render',
                        action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo',
                        action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor',
                        action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--logger-level',
                        type=int,
                        default=logging.INFO,
                        help='Level of the root logger.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3**-0.5)

    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam().setup(policy)
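
    # These q-functions assume a helper that concatenates the observation and
    # the action before feeding the critic; a minimal version, matching the
    # definition used in the later examples on this page:
    def concat_obs_and_action(obs, action):
        """Concat observation and action to feed the critic."""
        return F.concat((obs, action), axis=-1)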

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(policy.xp.zeros_like(obs_space.low,
                                                     dtype=np.float32)[None],
                                name='observation')
    fake_action = chainer.Variable(policy.xp.zeros_like(
        action_space.low, dtype=np.float32)[None],
                                   name='action')
    chainerrl.misc.draw_computational_graph([policy(fake_obs)],
                                            os.path.join(
                                                args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph([q_func1(fake_obs, fake_action)],
                                            os.path.join(
                                                args.outdir, 'q_func1'))
    chainerrl.misc.draw_computational_graph([q_func2(fake_obs, fake_action)],
                                            os.path.join(
                                                args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.AdditiveGaussian(scale=0.1,
                                          low=action_space.low,
                                          high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = chainerrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
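
# Assumed entry point for running this example as a standalone script:
if __name__ == '__main__':
    main()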
Example #3
    def _test_load_ddpg(self, gpu):
        def concat_obs_and_action(obs, action):
            return F.concat((obs, action), axis=-1)

        action_size = 3
        winit = chainer.initializers.LeCunUniform(3**-0.5)
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        policy = chainer.Sequential(
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, action_size, initialW=winit),
            F.tanh,
            chainerrl.distribution.ContinuousDeterministicDistribution,
        )
        from chainerrl.agents.ddpg import DDPGModel
        model = DDPGModel(q_func=q_func, policy=policy)

        obs_low = [-np.inf] * 11
        fake_obs = chainer.Variable(model.xp.zeros_like(
            obs_low, dtype=np.float32)[None],
                                    name='observation')
        fake_action = chainer.Variable(model.xp.zeros_like(
            [-1., -1., -1.], dtype=np.float32)[None],
                                       name='action')
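        # Forward dummy inputs once so that the lazily initialized
        # L.Linear(None, ...) layers create their parameters before the
        # optimizers are set up and the pretrained weights are loaded.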
        policy(fake_obs)
        q_func(fake_obs, fake_action)

        opt_a = optimizers.Adam()
        opt_c = optimizers.Adam()
        opt_a.setup(model['policy'])
        opt_c.setup(model['q_function'])

        explorer = explorers.AdditiveGaussian(scale=0.1,
                                              low=[-1., -1., -1.],
                                              high=[1., 1., 1.])

        agent = agents.DDPG(model,
                            opt_a,
                            opt_c,
                            replay_buffer.ReplayBuffer(100),
                            gamma=0.99,
                            explorer=explorer,
                            replay_start_size=1000,
                            target_update_method='soft',
                            target_update_interval=1,
                            update_interval=1,
                            soft_update_tau=5e-3,
                            n_times_update=1,
                            gpu=gpu,
                            minibatch_size=100,
                            burnin_action_func=None)

        model, exists = download_model("DDPG",
                                       "Hopper-v2",
                                       model_type=self.pretrained_type)
        agent.load(model)
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
Example #4
    def _test_load_td3(self, gpu):
        def concat_obs_and_action(obs, action):
            """Concat observation and action to feed the critic."""
            return F.concat((obs, action), axis=-1)

        def make_q_func_with_optimizer():
            q_func = chainer.Sequential(
                concat_obs_and_action,
                L.Linear(None, 400, initialW=winit),
                F.relu,
                L.Linear(None, 300, initialW=winit),
                F.relu,
                L.Linear(None, 1, initialW=winit),
            )
            q_func_optimizer = optimizers.Adam().setup(q_func)
            return q_func, q_func_optimizer

        winit = chainer.initializers.LeCunUniform(3**-0.5)

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()

        action_size = 3
        policy = chainer.Sequential(
            L.Linear(None, 400, initialW=winit),
            F.relu,
            L.Linear(None, 300, initialW=winit),
            F.relu,
            L.Linear(None, action_size, initialW=winit),
            F.tanh,
            chainerrl.distribution.ContinuousDeterministicDistribution,
        )

        policy_optimizer = optimizers.Adam().setup(policy)

        rbuf = replay_buffer.ReplayBuffer(100)
        explorer = explorers.AdditiveGaussian(scale=0.1,
                                              low=[-1., -1., -1.],
                                              high=[1., 1., 1.])

        agent = agents.TD3(policy,
                           q_func1,
                           q_func2,
                           policy_optimizer,
                           q_func1_optimizer,
                           q_func2_optimizer,
                           rbuf,
                           gamma=0.99,
                           soft_update_tau=5e-3,
                           explorer=explorer,
                           replay_start_size=10000,
                           gpu=gpu,
                           minibatch_size=100,
                           burnin_action_func=None)

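        # Fetch the pretrained TD3 weights for Hopper-v2 and load them into
        # the freshly constructed agent.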
        model, exists = download_model("TD3",
                                       "Hopper-v2",
                                       model_type=self.pretrained_type)
        agent.load(model)
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
Example #5
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='Inv',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument(
        '--outdir',
        type=str,
        default='results',
        help='Directory path to save output files.'
        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=420,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**5,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs',
                        type=int,
                        default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=100,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument(
        '--replay-start-size',
        type=int,
        default=1000,
        help='Minimum replay buffer size before performing gradient updates.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=4,
                        help='Minibatch size')
    parser.add_argument('--logger-level',
                        type=int,
                        default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--render',
                        action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo',
                        action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor',
                        action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    def make_env(test):

        env = gym.make(
            "DaktyPushingSimulationEnv-v0",
            level=5,
            simulation_backend="mujoco",
            control_frequency_in_hertz=100,
            state_space_components_to_be_used=None,
            alternate_env_object=None,
            discretization_factor_torque_control_space=None,
            model_as_function_for_pixel_to_latent_space_parsing=(None, None))

        print('\n############\n', env, '\n############\n')

        env.unwrapped.finger.set_resolution_quality('low')

        print('\n############\n', env, '\n############\n')

        env = gym.wrappers.TimeLimit(env)

        print('\n############\n', env, '\n############\n')

        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env

        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3**-0.5)
    # Define the policy and its optimizer (output_dim = action_size).
    policy = chainer.Sequential(
        L.Linear(None, 128, initialW=winit),
        F.relu,
        L.Linear(None, 64, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    policy_optimizer = optimizers.Adam(3e-4).setup(policy)

    # policy.to_gpu(0)
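
    # As in the earlier examples, the critics assume a helper that concatenates
    # the observation and the action; a minimal version, matching the
    # definition used elsewhere on this page:
    def concat_obs_and_action(obs, action):
        """Concat observation and action to feed the critic."""
        return F.concat((obs, action), axis=-1)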
    # Define the q-functions and their optimizers (output_dim = 1).
    # Two identical q-functions are built, each with its own optimizer.

    def make_q_func_with_optimizer():
        q_func = chainer.Sequential(
            concat_obs_and_action,
            L.Linear(None, 128, initialW=winit),
            F.relu,
            L.Linear(None, 64, initialW=winit),
            F.relu,
            L.Linear(None, 1, initialW=winit),
        )
        q_func_optimizer = optimizers.Adam().setup(q_func)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # q_func1.to_gpu(0)
    # q_func2.to_gpu(0)

    print('\n\n-------------------\n', obs_space.low.shape,
          '\n-------------------\n')

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(policy.xp.zeros_like(obs_space.low,
                                                     dtype=np.float32)[None],
                                name='observation')
    fake_action = chainer.Variable(policy.xp.zeros_like(
        action_space.low, dtype=np.float32)[None],
                                   name='action')
    chainerrl.misc.draw_computational_graph([policy(fake_obs)],
                                            os.path.join(
                                                args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph([q_func1(fake_obs, fake_action)],
                                            os.path.join(
                                                args.outdir, 'q_func1'))
    chainerrl.misc.draw_computational_graph([q_func2(fake_obs, fake_action)],
                                            os.path.join(
                                                args.outdir, 'q_func2'))

    rbuf = replay_buffer.ReplayBuffer(10**5)

    explorer = explorers.AdditiveGaussian(scale=0.1,
                                          low=action_space.low,
                                          high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = chainerrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    # agent.to_gpu(0)

    if len(args.load) > 0:
        agent.load(args.load)

    sys.stdout.flush()

    print('\nbeginning training\n')

    n_episodes = 10000

    # pbar = tqdm(total=n_episodes)

    max_episode_len = 5000
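
    # Manual training loop using ChainerRL's episodic API: act_and_train()
    # selects an exploratory action and records the observed reward, and
    # stop_episode_and_train() finalizes the episode with the last transition.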
    for i in range(1, n_episodes + 1):

        # pbar.update(1)

        obs = env.reset()
        # print('obs inital..............', obs.shape)
        reward = 0
        done = False
        R = 0  # return (sum of rewards)
        t = 0  # time step

        pbar = tqdm(total=max_episode_len)

        while not done and t < max_episode_len:

            pbar.update(1)

            # Uncomment to watch the behaviour
            # env.render()
            action = agent.act_and_train(obs, reward)
            # print('action..................', action)

            obs, reward, done, _ = env.step(action)
            # print('obs.....................', obs)
            # print('reward..................', reward)

            R += reward
            t += 1

        pbar.close()
        if i % 1 == 0:
            print('episode:', i, 'R:', R, 'statistics:',
                  agent.get_statistics())
        agent.stop_episode_and_train(obs, reward, done)
    print('Finished.')