Example #1
def test_to():
    """Test the torch function that moves modules to GPU.

        Test that the policy and qfunctions are moved to gpu if gpu is
        available.

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)

    set_gpu_mode(torch.cuda.is_available())
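    # MTSAC.to() should move the policy, both Q-functions and the per-task
    # log-alpha tensor onto the device selected by set_gpu_mode(); the
    # asserts below check exactly that.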
    mtsac.to()
    device = global_device()
    for param in mtsac._qf1.parameters():
        assert param.device == device
    for param in mtsac._qf2.parameters():
        assert param.device == device
    for param in mtsac._policy.parameters():
        assert param.device == device
    assert mtsac._log_alpha.device == device
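
All of the snippets on this page omit their import headers. The block below is a sketch of what the test examples would need; it assumes that metarl keeps the same module layout as the garage project it is forked from, so the exact paths are an assumption rather than a verified API:

import numpy as np
import torch
from torch.nn import functional as F

from metarl.envs import MetaRLEnv
from metarl.envs.multi_env_wrapper import MultiEnvWrapper
from metarl.envs.multi_env_wrapper import round_robin_strategy
from metarl.experiment import LocalRunner, deterministic
from metarl.replay_buffer import PathBuffer
from metarl.sampler import LocalSampler
from metarl.torch import global_device, set_gpu_mode
from metarl.torch.algos import MTSAC
from metarl.torch.policies import TanhGaussianMLPPolicy
from metarl.torch.q_functions import ContinuousMLPQFunction
# snapshot_config, used in some of the tests, is presumably provided by the
# project's test fixtures.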
Example #2
def ml1_push_v1_sac(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task."""
    runner = LocalRunner(ctxt)
    Ml1_reach_envs = get_ML1_envs("push-v1")
    Ml1_reach_test_envs = get_ML1_envs_test("push-v1")
    env = MTMetaWorldWrapper(Ml1_reach_envs)

    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SACReplayBuffer(env_spec=env.spec, max_size=int(1e6))
    sampler_args = {'agent': policy, 'max_path_length': 150}

    timesteps = 100000000
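    # The lines below spread the step budget above over the run: the total
    # number of sampling iterations (timesteps / batch_size) is grouped into
    # roughly num_evaluation_points reported epochs of epoch_cycles
    # iterations each, so evaluation happens about 500 times over training.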
    batch_size = int(150 * env.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    sac = MTSAC(env=env,
                eval_env_dict=Ml1_reach_test_envs,
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=250,
                epoch_cycles=epoch_cycles,
                use_automatic_entropy_tuning=True,
                replay_buffer=replay_buffer,
                min_buffer_size=7500,
                target_update_tau=5e-3,
                discount=0.99,
                buffer_batch_size=6400)
    tu.set_gpu_mode(True)
    sac.to('cuda:0')

    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)

    runner.train(n_epochs=epochs, batch_size=batch_size)
Example #3
def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=100,
                  max_path_length=100,
                  eval_env=test_envs,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=1,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size,
                  fixed_alpha=np.exp(0.5))
    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
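    # With fixed_alpha=np.exp(0.5), every per-task log-alpha should stay
    # fixed at 0.5 and automatic entropy tuning should be disabled, which
    # the asserts before and after training verify.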
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    runner.setup(mtsac, env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1, batch_size=128, plot=False)
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    assert not mtsac._use_automatic_entropy_tuning
Example #4
def test_mtsac_get_log_alpha(monkeypatch):
    """Check that the private function _get_log_alpha functions correctly.

    MTSAC uses disentangled alphas, meaning that

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
    for i, _ in enumerate(env_names):
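        # Each observation carries a one-hot task id, so _get_log_alpha
        # should return the i-th log-alpha (monkeypatched above to i + 1)
        # for every transition in the batch.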
        obs = torch.Tensor([env.reset()] * buffer_batch_size)
        log_alpha = mtsac._get_log_alpha(dict(observation=obs))
        assert (log_alpha == torch.Tensor([i + 1, i + 1])).all().item()
        assert log_alpha.size() == torch.Size([mtsac.buffer_batch_size])
Example #5
def test_mtsac_inverted_double_pendulum():
    """Performance regression test of MTSAC on 2 InvDoublePendulum envs."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=100,
                  max_path_length=100,
                  eval_env=test_envs,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    runner.setup(mtsac, env, sampler_cls=LocalSampler)
    ret = runner.train(n_epochs=8, batch_size=128, plot=False)
    assert ret > 130
Example #6
def mt10_sac(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task."""
    runner = LocalRunner(ctxt)
    MT10_envs_by_id = {}
    MT10_envs_test = {}
    for (task, env) in EASY_MODE_CLS_DICT.items():
        MT10_envs_by_id[task] = MetaRLEnv(
            env(*EASY_MODE_ARGS_KWARGS[task]['args'],
                **EASY_MODE_ARGS_KWARGS[task]['kwargs']))
        # Python 3.6+ dicts preserve insertion order
        MT10_envs_test[task] = MetaRLEnv(
            env(*EASY_MODE_ARGS_KWARGS[task]['args'],
                **EASY_MODE_ARGS_KWARGS[task]['kwargs']))

    env = IgnoreDoneWrapper(MTMetaWorldWrapper(MT10_envs_by_id))

    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SACReplayBuffer(env_spec=env.spec, max_size=int(1e6))
    sampler_args = {'agent': policy, 'max_path_length': 150}

    timesteps = 20000000
    batch_size = int(150 * env.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    sac = MTSAC(env=env,
                eval_env_dict=MT10_envs_test,
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=150,
                epoch_cycles=epoch_cycles,
                use_automatic_entropy_tuning=True,
                replay_buffer=replay_buffer,
                min_buffer_size=1500,
                target_update_tau=5e-3,
                discount=0.99,
                buffer_batch_size=1280)
    tu.set_gpu_mode(True)
    sac.to('cuda:0')

    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)

    runner.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with MT50 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of the GPU in training.
        _gpu (int): The ID of the gpu (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    for task_name in task_names:
        train_env = normalize(MetaRLEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(MetaRLEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    timesteps = 100000000
    batch_size = int(150 * mt50_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)
def mt50_sac_normalize_all(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task."""
    runner = LocalRunner(ctxt)
    envs = MT50.get_train_tasks(sample_all=True)
    test_envs = MT50.get_test_tasks(sample_all=True)
    MT50_envs_by_id = {
        name: MetaRLEnv(
            normalize(env,
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=False))
        for (name, env) in zip(envs._task_names, envs._task_envs)
    }
    MT50_envs_test = {
        name: MetaRLEnv(normalize(env, normalize_obs=True, flatten_obs=False))
        for (name, env) in zip(test_envs._task_names, test_envs._task_envs)
    }
    env = MTMetaWorldWrapper(MT50_envs_by_id)

    policy = TanhGaussianMLPPolicy2(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SACReplayBuffer(env_spec=env.spec, max_size=int(1e6))
    sampler_args = {'agent': policy, 'max_path_length': 150}

    timesteps = 100000000
    batch_size = int(150 * env.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    sac = MTSAC(env=env,
                eval_env_dict=MT50_envs_test,
                env_spec=env.spec,
                policy=policy,
                qf1=qf1,
                qf2=qf2,
                gradient_steps_per_itr=250,
                epoch_cycles=epoch_cycles,
                use_automatic_entropy_tuning=True,
                replay_buffer=replay_buffer,
                min_buffer_size=7500,
                target_update_tau=5e-3,
                discount=0.99,
                buffer_batch_size=6400)
    tu.set_gpu_mode(True)
    sac.to('cuda:0')

    runner.setup(algo=sac,
                 env=env,
                 sampler_cls=SimpleSampler,
                 sampler_args=sampler_args)

    runner.train(n_epochs=epochs, batch_size=batch_size)
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC with the ML1 pick-place-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    train_envs = []
    test_envs = []
    env_names = []
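    # Build 50 copies of the pick-place task; the evaluation environments
    # are deep copies of the training ones made via a pickle round-trip.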
    for i in range(50):
        train_env = MetaRLEnv(
            normalize(mwb.ML1.get_train_tasks('pick-place-v1'),
                      normalize_reward=True))
        test_env = pickle.loads(pickle.dumps(train_env))
        env_names.append('pick_place_{}'.format(i))
        train_envs.append(train_env)
        test_envs.append(test_env)
    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    policy = TanhGaussianMLPPolicy(
        env_spec=ml1_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=ml1_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)