Example #1
def get_goal_example_environment_from_variant(variant):
    import gym

    # Register the multiworld goal-example environments on demand if the
    # requested task id is not already in Gym's registry.
    if variant['task'] not in [env.id for env in gym.envs.registry.all()]:
        from multiworld.envs.mujoco import register_goal_example_envs
        register_goal_example_envs()

    # GymAdapter is imported at module level in the original source
    # (not shown in this snippet).
    return GymAdapter(env=gym.make(variant['task']))
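A minimal usage sketch for this helper. The variant dict below is illustrative; 'StateDoorPullEnv-v0' is one of the goal-example ids used elsewhere on this page, and GymAdapter is assumed to be importable as in the original module:

# Hypothetical call; 'task' must name a registered (or registerable) Gym id.
variant = {'task': 'StateDoorPullEnv-v0'}
env = get_goal_example_environment_from_variant(variant)
# The returned object wraps a gym.Env, so the usual reset/step cycle applies.
observation = env.reset()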
Example #2
def get_goal_example_environment_from_variant(environment_name, gym_adapter=True, eval=False):
    import gym

    # Register the multiworld goal-example environments on demand if the
    # requested id is not already in Gym's registry.
    if environment_name not in [env.id for env in gym.envs.registry.all()]:
        from multiworld.envs.mujoco import register_goal_example_envs
        register_goal_example_envs()

    # Optionally wrap the environment in GymAdapter (imported at module level
    # in the original source, not shown in this snippet).
    if gym_adapter:
        return GymAdapter(env=gym.make(environment_name))
    else:
        return gym.make(environment_name)
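For comparison, this version takes the environment name directly; a short sketch assuming the same 'StateDoorPullEnv-v0' id as above (the calls themselves are illustrative):

# Wrapped in GymAdapter (the default) versus returned as a raw gym.Env.
wrapped_env = get_goal_example_environment_from_variant('StateDoorPullEnv-v0')
raw_env = get_goal_example_environment_from_variant(
    'StateDoorPullEnv-v0', gym_adapter=False)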
Example #3
def get_goal_example_environment_from_variant(variant):
    import gym

    if variant['task'] not in [env.id for env in gym.envs.registry.all()]:
        if 'Manip' in variant['task']:
            # Importing manip_envs registers its environments as a side effect.
            import manip_envs
        else:
            from multiworld.envs.mujoco import register_goal_example_envs
            register_goal_example_envs()
            from metaworld.envs.mujoco import register_rl_with_videos_custom_envs
            register_rl_with_videos_custom_envs()
#            import mj_envs.hand_manipulation_suite

#        from metaworld.envs.mujoco.sawyer_xyz import register_environments; register_environments()
    return GymAdapter(env=gym.make(variant['task']))
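The register-on-demand pattern shared by the three examples above can be factored into a standalone helper; a minimal sketch (the name ensure_goal_example_env is hypothetical, not part of the original code):

import gym

def ensure_goal_example_env(env_id):
    # Register the multiworld goal-example environments only when the
    # requested id is not already in Gym's registry, then build the env.
    if env_id not in (spec.id for spec in gym.envs.registry.all()):
        from multiworld.envs.mujoco import register_goal_example_envs
        register_goal_example_envs()
    return gym.make(env_id)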
Example #4
from distutils.util import strtobool
import json
import os

import softlearning.algorithms.utils as alg_utils
import softlearning.environments.utils as env_utils
from softlearning.misc.utils import datetimestamp

DEFAULT_TASK = 'StateDoorPullEnv-v0'
DEFAULT_ALGORITHM = 'VICE'
AVAILABLE_ALGORITHMS = set(alg_utils.ALGORITHM_CLASSES.keys())

import gym
from multiworld.envs.mujoco import register_goal_example_envs
# Snapshot Gym's registry before and after registration so that
# `goal_example_envs` holds exactly the ids added by register_goal_example_envs().
envs_before = set(env_spec.id for env_spec in gym.envs.registry.all())
register_goal_example_envs()
envs_after = set(env_spec.id for env_spec in gym.envs.registry.all())
goal_example_envs = tuple(sorted(envs_after - envs_before))


def add_ray_init_args(parser):
    def init_help_string(help_string):
        return help_string + " Passed to `ray.init`."

    parser.add_argument(
        '--cpus',
        type=int,
        default=None,
        help=init_help_string("Cpus to allocate to ray process."))
    parser.add_argument(
        '--gpus',
        type=int,
        default=None,
        help=init_help_string("Gpus to allocate to ray process."))
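A sketch of how the registry diff above could feed command-line options, restricting --task to the newly registered goal-example ids (this parser wiring is illustrative and not taken from the original script):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--task',
    type=str,
    choices=goal_example_envs,  # ids added by register_goal_example_envs()
    default=DEFAULT_TASK)
parser.add_argument(
    '--algorithm',
    type=str,
    choices=sorted(AVAILABLE_ALGORITHMS),
    default=DEFAULT_ALGORITHM)
add_ray_init_args(parser)
args = parser.parse_args()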
Example #5
def experiment(variant):
    # NOTE: this snippet assumes module-level imports not shown here: copy, gym,
    # numpy as np, torch.nn as nn, a pytorch_util module as ptu, and the CNN,
    # Flatten, MlpQfWithObsProcessor, TanhGaussianPolicyAdapter, MakeDeterministic,
    # MdpPathCollector, MdpStepCollector, EnvReplayBuffer, SACTrainer,
    # TorchBatchRLAlgorithm, and TorchOnlineRLAlgorithm classes used below.
    from multiworld.envs.mujoco import register_goal_example_envs
    register_goal_example_envs()

    eval_env = gym.make('Image48SawyerPushForwardEnv-v0')
    expl_env = gym.make('Image48SawyerPushForwardEnv-v0')
    # Hack for now
    eval_env.wrapped_env.transpose = True
    expl_env.wrapped_env.transpose = True

    img_width, img_height = eval_env.image_shape
    num_channels = 3

    action_dim = int(np.prod(eval_env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=img_width,
        input_height=img_height,
        input_channels=num_channels,
        added_fc_input_size=4,
        output_conv_channels=True,
        output_size=None,
    )

    qf_cnn = CNN(**cnn_params)
    qf_obs_processor = nn.Sequential(
        qf_cnn,
        Flatten(),
    )

    qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    qf_kwargs['obs_processor'] = qf_obs_processor
    qf_kwargs['output_size'] = 1
    qf_kwargs['input_size'] = (action_dim + qf_cnn.conv_output_flat_size)
    qf1 = MlpQfWithObsProcessor(**qf_kwargs)
    qf2 = MlpQfWithObsProcessor(**qf_kwargs)

    target_qf_cnn = CNN(**cnn_params)
    target_qf_obs_processor = nn.Sequential(
        target_qf_cnn,
        Flatten(),
    )
    target_qf_kwargs = copy.deepcopy(variant['qf_kwargs'])
    target_qf_kwargs['obs_processor'] = target_qf_obs_processor
    target_qf_kwargs['output_size'] = 1
    target_qf_kwargs['input_size'] = (action_dim +
                                      target_qf_cnn.conv_output_flat_size)
    target_qf1 = MlpQfWithObsProcessor(**target_qf_kwargs)
    target_qf2 = MlpQfWithObsProcessor(**target_qf_kwargs)

    action_dim = int(np.prod(eval_env.action_space.shape))
    policy_cnn = CNN(**cnn_params)
    policy_obs_processor = nn.Sequential(
        policy_cnn,
        Flatten(),
    )
    policy = TanhGaussianPolicyAdapter(policy_obs_processor,
                                       policy_cnn.conv_output_flat_size,
                                       action_dim, **variant['policy_kwargs'])

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env, eval_policy, **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env, policy, **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
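The experiment() function above reads a fixed set of keys from `variant`; a skeleton of that structure is sketched below. The sub-dicts are left empty because their contents depend on the CNN, SAC trainer, and collector constructors in use, which are not shown in this snippet; the replay buffer size is an illustrative placeholder.

variant = dict(
    cnn_params=dict(),                  # kwargs for the CNN image encoder
    qf_kwargs=dict(),                   # shared kwargs for the Q-function MLPs
    policy_kwargs=dict(),               # kwargs for TanhGaussianPolicyAdapter
    eval_path_collector_kwargs=dict(),  # kwargs for the evaluation path collector
    expl_path_collector_kwargs=dict(),  # kwargs for the exploration collector
    replay_buffer_size=int(1e6),        # capacity of EnvReplayBuffer (illustrative)
    trainer_kwargs=dict(),              # kwargs for SACTrainer
    algo_kwargs=dict(),                 # kwargs for the batch/online RL algorithm
    collection_mode='batch',            # 'batch' or 'online'
)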