Example #1
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    if variant['multitask']:
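        # MultitaskToFlatEnv presents the goal-conditioned (multitask) env as a
        # standard flat env, so single-task algorithms such as SAC can be used.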
        env = MultitaskToFlatEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
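These experiment functions are driven entirely by the variant dictionary they receive. Below is a minimal sketch of a variant for Example #1, assuming the railrl imports used above; the keys mirror the variant[...] lookups in the code, while the concrete values and the algo_params entries are illustrative placeholders rather than railrl defaults.

# Sketch of a variant for the SAC experiment in Example #1. The keys mirror
# the variant[...] lookups above; the concrete values (and the algo_params
# entries) are illustrative placeholders, not railrl defaults.
variant = dict(
    env_class=CylinderXYPusher2DEnv,  # e.g. the multitask pusher from Example #4
    env_kwargs=dict(),                # forwarded to env_class(**env_kwargs)
    multitask=True,                   # wrap the env with MultitaskToFlatEnv
    net_size=300,                     # hidden layer width for qf, vf, and policy
    algo_params=dict(                 # forwarded to SoftActorCritic(...)
        num_epochs=100,
        num_steps_per_epoch=1000,
        discount=0.99,
    ),
)
experiment(variant)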
Example #2
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
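    # OUStrategy adds temporally correlated Ornstein-Uhlenbeck noise to the
    # policy's actions, the usual exploration scheme for DDPG.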
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    obs_dim = int(env.observation_space.flat_dim)
    action_dim = int(env.action_space.flat_dim)
    obs_normalizer = TorchFixedNormalizer(obs_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
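    # The same normalizer instances are passed to the Q-function, the policy,
    # and DDPG itself, so all of them scale observations and actions consistently.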
    qf = MlpQf(input_size=obs_dim + action_dim,
               output_size=1,
               obs_normalizer=obs_normalizer,
               action_normalizer=action_normalizer,
               **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           obs_normalizer=obs_normalizer,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf,
                     policy,
                     exploration_policy,
                     obs_normalizer=obs_normalizer,
                     action_normalizer=action_normalizer,
                     **variant['algo_kwargs'])
    algorithm.train()
Example #3
def experiment(variant):
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs'])  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    # else:
    # env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
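    # TD3 maintains two Q-networks and takes the minimum of their targets
    # (clipped double Q-learning) to reduce overestimation bias.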
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
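    # Optionally move the networks (and the VAE inside the wrapped env) to the GPU.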
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
Example #4
def experiment(variant):
    if variant['multitask']:
        env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
        env = MultitaskToFlatEnv(env)
    else:
        env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #5
def experiment(variant):
    env = variant['env_class']()
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)

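    # Discrete-action Q-network: one output per action (env.action_space.n),
    # suitable for DQN-style algorithms.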
    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=int(np.prod(env.observation_space.shape)),
        output_size=env.action_space.n,
    )
    qf_criterion = variant['qf_criterion_class']()
    algorithm = variant['algo_class'](env,
                                      qf=qf,
                                      qf_criterion=qf_criterion,
                                      **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #6
def td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #7
def experiment(variant):
    from cheetah_env import HalfCheetahEnvNew
    from cost_functions import cheetah_cost_fn, \
        hopper_cost_fn, \
        swimmer_cost_fn
    from hopper_env import HopperEnvNew
    from main_solution import train_dagger
    from railrl.core import logger
    from swimmer_env import SwimmerEnvNew
    env_name_or_class = variant['env_name_or_class']

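    # Resolve the environment and its cost function either from a name string
    # or directly from an env class.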
    if type(env_name_or_class) == str:
        if 'cheetah' in str.lower(env_name_or_class):
            env = HalfCheetahEnvNew()
            cost_fn = cheetah_cost_fn
        elif 'hopper' in str.lower(env_name_or_class):
            env = HopperEnvNew()
            cost_fn = hopper_cost_fn
        elif 'swimmer' in str.lower(env_name_or_class):
            env = SwimmerEnvNew()
            cost_fn = swimmer_cost_fn
        else:
            raise NotImplementedError
    else:
        env = env_name_or_class()
        from railrl.envs.wrappers import NormalizedBoxEnv
        env = NormalizedBoxEnv(env)
        if env_name_or_class == Pusher2DEnv:
            cost_fn = pusher2d_cost_fn
        elif env_name_or_class == Reacher7Dof:
            cost_fn = reacher7dof_cost_fn
        elif env_name_or_class == HalfCheetah:
            cost_fn = half_cheetah_cost_fn
        else:
            if variant['multitask']:
                env = MultitaskToFlatEnv(env)
            cost_fn = env.cost_fn

    train_dagger(env=env,
                 cost_fn=cost_fn,
                 logdir=logger.get_snapshot_dir(),
                 **variant['dagger_params'])
Example #8
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
def experiment(variant):
    env = variant['env_class']()
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    env = ConvertEnvToRllab(env)

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        **variant['policy_kwargs'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
        #     **optimizer_params
        # )),
        **variant['trpo_params'])
    algo.train()
Example #10
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    env = NormalizedBoxEnv(env)
    env = ConvertEnvToTf(env)

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               **variant['policy_params'])

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    optimizer_params = variant['optimizer_params']
    algo_kwargs = variant['algo_kwargs']
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(**optimizer_params)),
                **algo_kwargs)
    algo.train()
Example #11
def experiment(variant):
    rdim = variant["rdim"]
    vae_paths = {
        2:
        "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id1/params.pkl",
        4:
        "/home/ashvin/data/s3doodad/ashvin/vae/point2d-conv-sweep2/run0/id4/params.pkl"
    }
    vae_path = vae_paths[rdim]
    vae = joblib.load(vae_path)
    print("loaded", vae_path)

    if variant['multitask']:
        env = MultitaskImagePoint2DEnv(**variant['env_kwargs'])
        env = VAEWrappedEnv(env,
                            vae,
                            use_vae_obs=True,
                            use_vae_reward=False,
                            use_vae_goals=False)
        env = MultitaskToFlatEnv(env)
    # else:
    # env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    training_env=env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"]))
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
Example #12
def experiment(variant):
    rdim = variant["rdim"]
    vae_paths = {
        2:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id0/params.pkl",
        4:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id1/params.pkl",
        8:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id2/params.pkl",
        16:
        "/home/ashvin/data/s3doodad/ashvin/vae/new-pusher2d/run2/id3/params.pkl"
    }
    vae_path = vae_paths[rdim]
    vae = torch.load(vae_path)
    print("loaded", vae_path)

    if variant['multitask']:
        env = FullPusher2DEnv(**variant["env_kwargs"])
        env = ImageMujocoEnv(env,
                             84,
                             camera_name="topview",
                             transpose=True,
                             normalize=True)
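        # The loaded VAE supplies latent observations, latent-space rewards,
        # and latent goals (see the use_vae_* flags below).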
        env = VAEWrappedImageGoalEnv(env,
                                     vae,
                                     use_vae_obs=True,
                                     use_vae_reward=True,
                                     use_vae_goals=True,
                                     render_goals=True,
                                     render_rollouts=True,
                                     track_qpos_goal=5)
        env = MultitaskToFlatEnv(env)
    # else:
    # env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    training_env=env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    print("use_gpu", variant["use_gpu"], bool(variant["use_gpu"]))
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
Example #13
def experiment(variant):
    rdim = variant["rdim"]
    use_env_goals = variant["use_env_goals"]
    vae_path = variant["vae_paths"][str(rdim)]
    render = variant["render"]
    wrap_mujoco_env = variant.get("wrap_mujoco_env", False)

    # vae = torch.load(vae_path)
    # print("loaded", vae_path)

    from railrl.envs.wrappers import ImageMujocoEnv, NormalizedBoxEnv
    from railrl.images.camera import sawyer_init_camera

    env = variant["env"](**variant['env_kwargs'])
    env = NormalizedBoxEnv(ImageMujocoEnv(
        env,
        imsize=84,
        keep_prev=0,
        init_camera=sawyer_init_camera,
    ))
    if wrap_mujoco_env:
        env = ImageMujocoEnv(env, 84, camera_name="topview", transpose=True, normalize=True)


    if use_env_goals:
        track_qpos_goal = variant.get("track_qpos_goal", 0)
        env = VAEWrappedImageGoalEnv(env, vae_path, use_vae_obs=True,
                                     use_vae_reward=True, use_vae_goals=True,
                                     render_goals=render, render_rollouts=render, track_qpos_goal=track_qpos_goal)
    else:
        env = VAEWrappedEnv(env, vae_path, use_vae_obs=True,
                            use_vae_reward=True, use_vae_goals=True,
                            render_goals=render, render_rollouts=render)

    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        training_env=env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
Example #14
import sys

from railrl.envs.mujoco.sawyer_push_and_reach_env import \
    SawyerPushAndReachXYEnv, SawyerPushAndReachXYEasyEnv
from railrl.envs.multitask.multitask_env import MultitaskToFlatEnv
from railrl.exploration_strategies.base import \
    PolicyWrappedWithExplorationStrategy
from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
from railrl.exploration_strategies.ou_strategy import OUStrategy
from railrl.policies.simple import ZeroPolicy
import numpy as np

print("making env")
# env = SawyerPushAndReachXYEasyEnv()
env = SawyerPushAndReachXYEnv()
env = MultitaskToFlatEnv(env)

policy = ZeroPolicy(env.action_space.low.size)
es = OUStrategy(env.action_space, theta=1)
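# NOTE: the EpsilonGreedy strategy below replaces the OUStrategy just defined.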
es = EpsilonGreedy(
    action_space=env.action_space,
    prob_random_action=0.1,
)
policy = exploration_policy = PolicyWrappedWithExplorationStrategy(
    exploration_strategy=es,
    policy=policy,
)
print("starting rollout")

import pygame
from pygame.locals import QUIT, KEYDOWN