def test_onehots_consistent_with_task_sampler():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    mt10 = metaworld.MT10()
    env = MetaWorldSetTaskEnv(mt10, 'train', add_env_onehot=True)
    policy = RandomPolicy(env.action_space)
    workers = WorkerFactory(seed=100, max_episode_length=1, n_workers=10)
    sampler1 = LocalSampler.from_worker_factory(workers, policy, env)
    env_ups = [
        SetTaskUpdate(MetaWorldSetTaskEnv, task, None)
        for task in env.sample_tasks(10)
    ]
    samples1 = sampler1.obtain_exact_episodes(1, policy, env_ups)
    task_sampler = MetaWorldTaskSampler(mt10, 'train', add_env_onehot=True)
    env_ups = task_sampler.sample(10)
    sampler2 = LocalSampler.from_worker_factory(workers, policy, env_ups)
    samples2 = sampler2.obtain_exact_episodes(1, policy, env_ups)
    name_to_obs1 = {}
    for obs1, name1 in zip(samples1.observations,
                           samples1.env_infos['task_name']):
        name_to_obs1[name1] = obs1
    for obs2, name2 in zip(samples2.observations,
                           samples2.env_infos['task_name']):
        assert (name_to_obs1[name2][-10:] == obs2[-10:]).all()
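# The assertion on obs[-10:] relies on the env one-hot being appended to the end
# of each observation when add_env_onehot=True. A minimal standalone sketch of
# that layout (the 39-dim base observation size is illustrative, not taken from
# the test above):
import numpy as np

base_obs = np.zeros(39)                      # hypothetical base Meta-World observation
task_index, num_tasks = 3, 10                # MT10 has 10 training environments
onehot = np.eye(num_tasks)[task_index]
augmented_obs = np.concatenate([base_obs, onehot])
assert (augmented_obs[-10:] == onehot).all()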
Example #2
def test_identical_environments():
    def helper(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            assert not (rand_vec_1 == rand_vec_2).all()

    # testing MT1
    mt1_1 = metaworld.MT1('sweep-into-v2', seed=10)
    mt1_2 = metaworld.MT1('sweep-into-v2', seed=10)
    helper(mt1_1, mt1_2)

    # testing ML1
    ml1_1 = metaworld.ML1('sweep-into-v2', seed=10)
    ml1_2 = metaworld.ML1('sweep-into-v2', seed=10)
    helper(ml1_1, ml1_2)

    # testing MT10
    mt10_1 = metaworld.MT10(seed=10)
    mt10_2 = metaworld.MT10(seed=10)
    helper(mt10_1, mt10_2)

    # testing ML10
    ml10_1 = metaworld.ML10(seed=10)
    ml10_2 = metaworld.ML10(seed=10)
    helper(ml10_1, ml10_2)

    # testing ML45
    ml45_1 = metaworld.ML45(seed=10)
    ml45_2 = metaworld.ML45(seed=10)
    helper(ml45_1, ml45_2)

    # testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # test that 2 benchmarks with different seeds have different goals
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
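# Once two identically seeded benchmarks are known to share rand_vecs, either one
# can be used in the usual Meta-World way. A minimal sketch, reusing the
# 'sweep-into-v2' task id from the test above:
import metaworld

mt1 = metaworld.MT1('sweep-into-v2', seed=10)
env = mt1.train_classes['sweep-into-v2']()  # instantiate the environment class
env.set_task(mt1.train_tasks[0])            # apply one of the seeded goal variations
obs = env.reset()                           # (obs, info) in newer gymnasium-based versions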
Example #3
def mtppo_metaworld_mt50(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 50.

    """
    set_seed(seed)
    mt50 = metaworld.MT50()
    train_task_sampler = MetaWorldTaskSampler(mt50,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 50 == 0
    assert n_tasks <= 2500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
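# A function with this (ctxt, ...) signature is normally launched through garage's
# wrap_experiment decorator, which creates the experiment context. A minimal launch
# sketch; the hyperparameter values below are placeholders, not recommendations:
from garage import wrap_experiment


@wrap_experiment(snapshot_mode='none')
def launch_mtppo_mt50(ctxt=None):
    mtppo_metaworld_mt50(ctxt, seed=1, epochs=500, batch_size=50000,
                         n_workers=10, n_tasks=50)


launch_mtppo_mt50()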
Example #4
def make_env(env_name, seed=10):
    if env_name.startswith("dm"):
        _, domain, task = env_name.split('.')
        env = dmc2gym.make(domain_name=domain,
                           task_name=task,
                           seed=seed,
                           visualize_reward=True)
        assert env.action_space.low.min() >= -1
        assert env.action_space.high.max() <= 1
    elif env_name.startswith('mw'):
        _, task = env_name.split('.')
        mt10 = metaworld.MT10()
        env = mt10.train_classes[task]()
        tasks = [t for t in mt10.train_tasks if t.env_name == task]
        env.set_task(tasks[0])
        if env.max_path_length <= 150:
            raise RuntimeError('Expected env.max_path_length > 150, got '
                               f'{env.max_path_length}')
        env._max_episode_steps = 150  # env.max_path_length
    else:
        env = gym.make(env_name)
    return env
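# Usage sketch for the prefix-based dispatch above; the exact task ids depend on
# the installed dm_control and Meta-World versions, so treat them as examples:
dm_env = make_env('dm.cartpole.swingup')   # dm_control: 'dm.<domain>.<task>'
mw_env = make_env('mw.reach-v1')           # Meta-World MT10 task: 'mw.<task>'
gym_env = make_env('HalfCheetah-v2')       # anything else is passed to gym.make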
Example #5
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    meta_batch_size = 10

    sampler = LocalSampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=env.spec.max_episode_length,
        # 1 sampler worker for each environment
        n_workers=meta_batch_size,
        worker_class=FragmentWorker,
        # Increasing n_envs increases the vectorization of a sampler worker,
        # which improves runtime performance, but you will need to adjust it
        # to your memory constraints. For reference, each worker uses
        # n_envs=8 by default and each environment is roughly 50 MB, so
        # creating 50 envs with 8 copies each comes out to about 20 GB of
        # memory. To leave room for running multiple seeds on one machine,
        # n_envs is reduced to 2 here.
        worker_args=dict(n_envs=2))

    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=mt10_train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
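# Worked example of the epoch arithmetic above, assuming (for illustration only)
# a 500-step episode length and a 20M-step budget:
timesteps = 20_000_000
max_episode_length = 500                            # read from env.spec at runtime
meta_batch_size = 10
batch_size = max_episode_length * meta_batch_size   # 5000 env steps per batch
epochs = timesteps // batch_size                    # 4000
epoch_cycles = epochs // 500                        # 8 batches between evaluations
epochs = epochs // epoch_cycles                     # 500 epochs, i.e. 500 evaluations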
Example #6
    def __init__(
        self,
        experiment_name,
        use_gpu,
        trainer_args
    ):
        """Train MTSAC with metaworld_experiments environment.
        Args:
            experiment_name: expeirment name to be used for logging and checkpointing
            use_wandb: boolean, defines whether or not to log to wandb
            use_gpu: boolean, defines whether or not to use to GPU for training
            trainer_args: named tuple with args given by config
        """

        # Define log and checkpoint dir
        self.checkpoint_dir = os.path.join(
            trainer_args.log_dir,
            f"{experiment_name}-{trainer_args.project_id}"
        )
        print(f"Checkpoint dir: {self.checkpoint_dir}")
        self.state_path = os.path.join(self.checkpoint_dir, "experiment_state.p")
        self.env_state_path = os.path.join(self.checkpoint_dir, "env_state.p")
        self.config_path = os.path.join(self.checkpoint_dir, "config.json")
        self.experiment_name = experiment_name

        # Only define viz_save_path if required to save visualizations locally
        self.viz_save_path = None
        if trainer_args.save_visualizations_local:
            self.viz_save_path = os.path.join(self.checkpoint_dir, "viz")

        # Check if loading from existing experiment
        self.loading_from_existing = os.path.exists(self.checkpoint_dir)
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        # Save arguments for later retrieval
        self.init_config(trainer_args)

        num_tasks = trainer_args.num_tasks

        # TODO: do we have to fix which GPU to use? run distributed across multiGPUs
        if use_gpu:
            set_gpu_mode(True, 0)

        if trainer_args.seed is not None:
            deterministic.set_seed(trainer_args.seed)

        # Note: different classes whether it uses 10 or 50 tasks. Why?
        mt_env = (
            metaworld.MT10(seed=trainer_args.env_seed) if num_tasks <= 10
            else metaworld.MT50(seed=trainer_args.env_seed)
        )

        train_task_sampler = MetaWorldTaskSampler(
            mt_env, "train", add_env_onehot=True
        )

        # TODO: add some clarifying comments of why these asserts are required
        assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
        assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"

        # TODO: do we have guarantees that in case seed is set, the tasks being sampled
        # are the same?
        mt_train_envs = train_task_sampler.sample(num_tasks)
        env = mt_train_envs[0]()

        if trainer_args.params_seed is not None:
            torch.manual_seed(trainer_args.params_seed)

        policy = create_policy_net(env_spec=env.spec, net_params=trainer_args)
        qf1 = create_qf_net(env_spec=env.spec, net_params=trainer_args)
        qf2 = create_qf_net(env_spec=env.spec, net_params=trainer_args)

        if trainer_args.params_seed is not None:
            calculate_mean_param("policy", policy)
            calculate_mean_param("qf1", qf1)
            calculate_mean_param("qf2", qf2)

        if trainer_args.override_weight_initialization:
            logging.warning("Overriding dendritic layer weight initialization")
            self.override_weight_initialization([policy, qf1, qf2])

        replay_buffer = PathBuffer(
            capacity_in_transitions=trainer_args.num_buffer_transitions
        )
        max_episode_length = env.spec.max_episode_length
        self.env_steps_per_epoch = int(max_episode_length * num_tasks)
        self.num_epochs = trainer_args.timesteps // self.env_steps_per_epoch

        sampler = RaySampler(
            agent=policy,
            envs=mt_train_envs,
            max_episode_length=max_episode_length,
            cpus_per_worker=trainer_args.cpus_per_worker,
            gpus_per_worker=trainer_args.gpus_per_worker,
            workers_per_env=trainer_args.workers_per_env,
            seed=trainer_args.seed,
        )

        self._algo = CustomMTSAC(
            env_spec=env.spec,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            replay_buffer=replay_buffer,
            sampler=sampler,
            train_task_sampler=train_task_sampler,
            gradient_steps_per_itr=int(
                max_episode_length * trainer_args.num_grad_steps_scale
            ),
            task_update_frequency=trainer_args.task_update_frequency,
            num_tasks=num_tasks,
            min_buffer_size=max_episode_length * num_tasks,
            target_update_tau=trainer_args.target_update_tau,
            discount=trainer_args.discount,
            buffer_batch_size=trainer_args.buffer_batch_size,
            policy_lr=trainer_args.policy_lr,
            qf_lr=trainer_args.qf_lr,
            reward_scale=trainer_args.reward_scale,
            num_evaluation_episodes=trainer_args.eval_episodes,
            fp16=trainer_args.fp16 if use_gpu else False,
            log_per_task=trainer_args.log_per_task,
            share_train_eval_env=trainer_args.share_train_eval_env
        )

        # Override with loaded networks if existing experiment
        self.current_epoch = 0
        if self.loading_from_existing:
            self.load_experiment_state()

        # Move all networks within the model on device
        self._algo.to()
Example #7
def te_ppo_mt10(ctxt, seed, n_epochs, batch_size_per_task, n_tasks):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.

    """
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=False)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 4
    inference_window = 6
    batch_size = batch_size_per_task * len(envs)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=TaskEmbeddingWorker)

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     sampler=sampler,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
Example #8
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.
    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()  # pylint: disable=no-member
    mt10_test = metaworld.MT10()  # pylint: disable=no-member

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env, normalize_reward=True)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    meta_batch_size = 10

    # ray.init(local_mode=True, log_to_driver=False, ignore_reinit_error=True)
    sampler = SingleVecWorkSampler(
        agents=policy,
        envs=mt10_train_envs,
        n_workers=meta_batch_size,
        max_episode_length=env.spec.max_episode_length,
        # 1 sampler worker for each environment
        worker_class=FragmentWorker,
        # increasing n_envs increases the vectorization of a sampler worker,
        # which improves runtime performance, but you will need to adjust it
        # to your memory constraints. For reference, each worker uses
        # n_envs=8 by default and each environment is roughly 50 MB, so
        # creating 50 envs with 8 copies each comes out to about 20 GB of
        # memory. Here n_envs=10 is used, i.e. 10 vectorized copies per
        # worker; reduce it if memory is tight.
        worker_args=dict(n_envs=10))

    # one episode for each task between gradient steps
    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  sampler=sampler,
                  gradient_steps_per_itr=env.spec.max_episode_length,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    trainer.setup(algo=mtsac, env=mt10_train_envs)
    import time
    s = time.time()
    mtsac.to()
    print(time.time() - s)

    s = time.time()
    # trainer.step_episode = trainer.obtain_samples(0, 1500, None, None)
    trainer.step_episode = trainer.obtain_samples(0, 2000, None, None)
    print(time.time() - s)

    from garage import StepType
    path_returns = []
    for path in trainer.step_episode:
        mtsac.replay_buffer.add_path(
            dict(observation=path['observations'],
                 action=path['actions'],
                 reward=path['rewards'].reshape(-1, 1),
                 next_observation=path['next_observations'],
                 terminal=np.array([
                     step_type == StepType.TERMINAL
                     for step_type in path['step_types']
                 ]).reshape(-1, 1)))
        path_returns.append(sum(path['rewards']))

    s = time.time()
    for _ in range(10):
        trainer._algo.train_once()
    print((time.time() - s) / 10)
Example #9
    def __init__(
        self,
        benchmark_name: str,
        save_memory: bool = False,
        add_observability: bool = False,
    ) -> None:
        """ Init function for environment wrapper. """

        # We import here so that we avoid importing metaworld if possible, since it is
        # dependent on mujoco.
        import metaworld
        from metaworld import Task

        # Set config for each benchmark.
        if benchmark_name.startswith("MT1_"):
            env_name = benchmark_name[4:]
            benchmark = metaworld.MT1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = False

        elif benchmark_name == "MT10":
            benchmark = metaworld.MT10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name == "MT50":
            benchmark = metaworld.MT50()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_train_"):
            env_name = benchmark_name[10:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_train":
            benchmark = metaworld.ML10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_train":
            benchmark = metaworld.ML45()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_test_"):
            env_name = benchmark_name[9:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.test_classes[env_name]}
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_test":
            benchmark = metaworld.ML10()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_test":
            benchmark = metaworld.ML45()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        else:
            raise NotImplementedError

        # Construct list of tasks for each environment, adding observability to tasks if
        # necessary.
        env_tasks = {}
        for task in tasks:
            if add_observability:
                task_data = dict(pickle.loads(task.data))
                task_data["partially_observable"] = False
                task = Task(env_name=task.env_name,
                            data=pickle.dumps(task_data))

            if task.env_name in env_tasks:
                if resample_tasks:
                    env_tasks[task.env_name].append(task)
            else:
                env_tasks[task.env_name] = [task]

        # Construct list of environment classes or class instances.
        self.save_memory = save_memory
        if self.save_memory:
            self.envs_info = [{
                "env_name": env_name,
                "env_cls": env_cls,
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]
        else:
            self.envs_info = [{
                "env_name": env_name,
                "env": env_cls(),
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]

        self.num_tasks = len(self.envs_info)

        # Sample environment.
        self._sample_environment()
Example #10
def mtppo_metaworld_mt10(ctxt, experiment_name, config_pth, seed, n_workers, n_tasks, use_wandb,
                         wandb_username, use_gpu):
    """Set up environment and algorithm and run the task.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
    """
    params = get_params(config_pth)
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)

    if use_wandb == 'True':
        use_wandb = True
        wandb.init(
            name=experiment_name,
            entity=wandb_username,
            project="mt10",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    assert n_tasks % 10 == 0
    assert n_tasks <= 500

    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = envs[0]

    policy = create_policy_net(env_spec=env.spec,
                               net_params=params["net"]
                               )
    value_function = create_vf_net(env_spec=env.spec,
                                   net_params=params["net"]
                                   )

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers,
                         worker_class=DefaultWorker)

    gpu_training = bool(use_gpu)

    algo = CustomMTPPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       sampler=sampler,
                       train_task_sampler=train_task_sampler,
                       num_tasks=n_tasks,
                       task_update_frequency=params["training"]["task_update_frequency"],
                       num_eval_eps=params["general_setting"]["eval_episodes"],
                       policy_lr=params["training"]["policy_lr"],
                       vf_lr=params["training"]["vf_lr"],
                       ppo_eps=params["training"]["ppo_eps"],
                       minibatch_size=params["training"]["minibatch_size"],
                       ppo_epochs=params["training"]["ppo_epochs"],
                       num_train_per_epoch=params["training"]["num_train_per_epoch"],
                       discount=params["general_setting"]["discount"],
                       gae_lambda=params["training"]["gae_lambda"],
                       center_adv=False,
                       wandb_logging=use_wandb,
                       eval_freq=params["general_setting"]["eval_freq"],
                       stop_entropy_gradient=True,
                       entropy_method='max',
                       gpu_training=gpu_training
                       )

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=params["training"]["epochs"],
                  batch_size=params["training"]["batch_episodes_per_task"],
                  plot=False)
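# get_params(config_pth) is expected to return a nested dict with at least the keys
# read above. A hypothetical minimal config sketch; the values are placeholders,
# not recommendations:
params = {
    "net": {},  # architecture settings consumed by create_policy_net / create_vf_net
    "training": {
        "policy_lr": 3e-4,
        "vf_lr": 3e-4,
        "ppo_eps": 0.2,
        "minibatch_size": 64,
        "ppo_epochs": 10,
        "num_train_per_epoch": 1,
        "gae_lambda": 0.95,
        "task_update_frequency": 1,
        "epochs": 500,
        "batch_episodes_per_task": 10,
    },
    "general_setting": {
        "discount": 0.99,
        "eval_episodes": 3,
        "eval_freq": 10,
    },
}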
Example #11
def mtsac_metaworld_mt10(ctxt=None,
                         *,
                         experiment_name,
                         config_pth,
                         seed,
                         use_wandb,
                         gpu):
    """Train MTSAC with MT10 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    print(f"Initiation took {time()-t0:.2f} secs")

    device = torch.device("cuda") if gpu else torch.device("cpu")
    print(f"Using GPU: {gpu}, Device: {device}")

    # maybe overriding other settings - this is required, why?
    if gpu:
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            project="mt10_debug",
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = params["net"]["num_tasks"]
    timesteps = 15000000
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)

    # Note: different classes whether it uses 10 or 50 tasks. Why?
    if num_tasks <= 10:
        mt_env = metaworld.MT10()
    else:
        mt_env = metaworld.MT50()

    train_task_sampler = MetaWorldTaskSampler(mt_env,
                                              "train",
                                              add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"
    mt_train_envs = train_task_sampler.sample(num_tasks)
    env = mt_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(
        params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(
        params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    print("Created policy")

    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    print("Created value functions")

    replay_buffer = PathBuffer(capacity_in_transitions=int(
        params["general_setting"]["num_buffer_transitions"]))
    max_episode_length = env.spec.max_episode_length
    # Note: are the episode lengths the same across all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt_train_envs,
        max_episode_length=max_episode_length,
        cpus_per_worker=params["sampler"]["cpus_per_worker"],
        gpus_per_worker=params["sampler"]["gpus_per_worker"],
        seed=None,  # set to get_seed() to make it deterministic
    )

    # will probably still need the sampler
    test_sampler = sampler
    # test_sampler = RaySampler(
    #     agents=policy,
    #     envs=mt_train_envs,
    #     max_episode_length=max_episode_length,
    #     # 1 sampler worker for each environment
    #     n_workers=num_tasks,
    #     worker_class=EvalWorker
    # )

    # Note: the only difference between sampler and test sampler is the worker;
    # EvalWorker differs by one line, using the mean action: a = agent_info["mean"].
    # Can we create a unified worker that contains both rules?

    # Number of transitions before a set of gradient updates
    # Note: should we use the avg episode length if it is not the same for all tasks?
    batch_size = int(max_episode_length * num_tasks)

    # TODO: this whole block seems unnecessary; it does not change anything.
    # Number of times policy is evaluated (also the # of epochs)
    num_evaluation_points = timesteps // batch_size
    epochs = timesteps // batch_size
    # number of times new batch of samples + gradient updates are done per epoch
    epoch_cycles = epochs // num_evaluation_points  # this will always be equal to 1
    epochs = epochs // epoch_cycles

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=1,
        num_tasks=num_tasks,
        steps_per_epoch=epoch_cycles,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"],
    )

    print("Created algo")

    mtsac.to(device=device)
    print("Moved networks to device")

    trainer.setup(algo=mtsac, env=mt_train_envs)
    print("Setup trainer")

    trainer.train(n_epochs=epochs, batch_size=batch_size)
Example #12
def mtsac_metaworld_mt10(ctxt=None,
                         *,
                         experiment_name,
                         config_pth,
                         seed,
                         timesteps,
                         use_wandb,
                         wandb_project_name,
                         gpu):
    """Train MTSAC with metaworld_experiments environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    print(f"Initiation took {time()-t0:.2f} secs")

    # Get experiment parameters (e.g. hyperparameters) and save the json file
    params = get_params(config_pth)

    with open(ctxt.snapshot_dir + "/params.json", "w") as json_file:
        json.dump(params, json_file)

    if use_wandb == "True":
        use_wandb = True
        wandb.init(
            name=experiment_name,
            project=wandb_project_name,
            group="Baselines{}".format("mt10"),
            reinit=True,
            config=params,
        )
    else:
        use_wandb = False

    num_tasks = 10
    deterministic.set_seed(seed)
    trainer = CustomTrainer(ctxt)
    mt10 = metaworld.MT10()

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              "train",
                                              add_env_onehot=True)

    assert num_tasks % 10 == 0, "Number of tasks must be divisible by 10"
    assert num_tasks <= 500, "Number of tasks must be less than or equal to 500"
    mt10_train_envs = train_task_sampler.sample(num_tasks)
    env = mt10_train_envs[0]()

    params["net"]["policy_min_std"] = np.exp(
        params["net"]["policy_min_log_std"])
    params["net"]["policy_max_std"] = np.exp(
        params["net"]["policy_max_log_std"])

    policy = create_policy_net(env_spec=env.spec, net_params=params["net"])
    qf1 = create_qf_net(env_spec=env.spec, net_params=params["net"])
    qf2 = create_qf_net(env_spec=env.spec, net_params=params["net"])

    replay_buffer = PathBuffer(capacity_in_transitions=int(
        params["general_setting"]["num_buffer_transitions"]))
    max_episode_length = env.spec.max_episode_length
    # Note: are the episode lengths the same across all tasks?

    sampler = RaySampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=DefaultWorker)

    test_sampler = RaySampler(
        agents=policy,
        envs=mt10_train_envs,
        max_episode_length=max_episode_length,
        # 1 sampler worker for each environment
        n_workers=num_tasks,
        worker_class=EvalWorker)

    # Note: the only difference between sampler and test sampler is the worker;
    # EvalWorker differs by one line, using the mean action: a = agent_info['mean'].
    # Can we create a unified worker that contains both rules?

    # Number of transitions before a set of gradient updates
    steps_between_updates = int(max_episode_length * num_tasks)

    # epoch: 1 cycle of data collection + gradient updates
    epochs = timesteps // steps_between_updates

    mtsac = CustomMTSAC(
        env_spec=env.spec,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        sampler=sampler,
        train_task_sampler=train_task_sampler,
        test_sampler=test_sampler,
        gradient_steps_per_itr=int(max_episode_length *
                                   params["training"]["num_grad_steps_scale"]),
        num_tasks=num_tasks,
        min_buffer_size=max_episode_length * num_tasks,
        target_update_tau=params["training"]["target_update_tau"],
        discount=params["general_setting"]["discount"],
        buffer_batch_size=params["training"]["buffer_batch_size"],
        policy_lr=params["training"]["policy_lr"],
        qf_lr=params["training"]["qf_lr"],
        reward_scale=params["training"]["reward_scale"],
        num_evaluation_episodes=params["general_setting"]["eval_episodes"],
        task_update_frequency=params["training"]["task_update_frequency"],
        wandb_logging=use_wandb,
        evaluation_frequency=params["general_setting"]["evaluation_frequency"])

    if gpu is not None:
        set_gpu_mode(True, gpu)

    mtsac.to()
    trainer.setup(algo=mtsac, env=mt10_train_envs)
    trainer.train(n_epochs=epochs, batch_size=steps_between_updates)
Example #13
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps):
    """Train MTSAC with MT10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.
        timesteps (int): Number of timesteps to run.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(ctxt)
    mt10 = metaworld.MT10()
    mt10_test = metaworld.MT10()

    # pylint: disable=missing-return-doc, missing-return-type-doc
    def wrap(env, _):
        return normalize(env)

    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              wrap,
                                              add_env_onehot=True)
    test_task_sampler = MetaWorldTaskSampler(mt10_test,
                                             'train',
                                             wrap,
                                             add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    mt10_train_envs = train_task_sampler.sample(n_tasks)
    env = mt10_train_envs[0]()
    mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    meta_batch_size = 10

    batch_size = int(env.spec.max_episode_length * meta_batch_size)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  eval_env=mt10_test_envs,
                  env_spec=env.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac,
                  env=mt10_train_envs,
                  sampler_cls=LocalSampler,
                  n_workers=meta_batch_size)
    trainer.train(n_epochs=epochs, batch_size=batch_size)