Example 1
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = FlattenMlp(input_size=obs_dim + goal_dim,
                    output_size=1,
                    **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    replay_buffer = SimpleHerReplayBuffer(env=env,
                                          **variant['replay_buffer_kwargs'])
    algorithm = HerSac(env=env,
                       policy=policy,
                       qf=qf,
                       vf=vf,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
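
Each of these scripts exposes a single experiment(variant) entry point driven by one nested variant dictionary. As a minimal sketch of how Example 1 might be invoked, with key names mirroring the variant[...] lookups above but hyperparameter values that are purely illustrative assumptions (they do not come from the original launch script):

variant = dict(
    env_kwargs=dict(),
    normalize=True,
    qf_kwargs=dict(hidden_sizes=[400, 300]),     # hidden sizes as in the later examples
    vf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_kwargs=dict(),                 # buffer size, relabeling settings, ...
    algo_kwargs=dict(),                          # epochs, batch size, discount, ...
)
experiment(variant)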
Example 2
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)  # flatten the goal-conditioned (multitask) env into a standard flat-observation env
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
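
The TD3 call above receives both the raw policy (the one being trained) and an exploration_policy; PolicyWrappedWithExplorationStrategy simply applies the chosen exploration strategy to the base policy's action whenever get_action is called. A minimal sketch of that wrapper in isolation, reusing only imports and calls that already appear in the scripts later in this section (ZeroPolicy stands in for a trained policy):

from railrl.envs.mujoco.sawyer_gripper_env import SawyerXYZEnv
from railrl.exploration_strategies.base import \
    PolicyWrappedWithExplorationStrategy
from railrl.exploration_strategies.ou_strategy import OUStrategy
from railrl.policies.simple import ZeroPolicy

env = SawyerXYZEnv()
exploration_policy = PolicyWrappedWithExplorationStrategy(
    exploration_strategy=OUStrategy(action_space=env.action_space),
    policy=ZeroPolicy(env.action_space.low.size),  # stand-in for a trained policy
)
obs = env.reset()
action, _ = exploration_policy.get_action(obs)  # base action plus OU exploration noise
env.step(action)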
Example 3
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = SimpleHerReplayBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerDdpg(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example 4
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)  # flatten the goal-conditioned (multitask) env into a standard flat-observation env
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 5
from railrl.envs.mujoco.sawyer_gripper_env import SawyerPushXYEnv, \
    SawyerPushEnv, SawyerXYZEnv
from railrl.envs.mujoco.sawyer_kitchen import KitchenCabinetEnv
from railrl.envs.wrappers import ImageMujocoEnv

from railrl.exploration_strategies.base import \
    PolicyWrappedWithExplorationStrategy
from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
from railrl.exploration_strategies.ou_strategy import OUStrategy
from railrl.policies.simple import ZeroPolicy
import numpy as np

print("making env")
# env = SawyerPushXYEnv(randomize_goals=True, frame_skip=50)
env = SawyerPushEnv(randomize_goals=False, frame_skip=50)
env = SawyerXYZEnv(frame_skip=50, pos_action_scale=2./100)
# only the last assignment takes effect; the two lines above are overridden
env = SawyerPushXYEnv(frame_skip=50, pos_action_scale=2./100)
# env = KitchenCabinetEnv()
from railrl.images.camera import sawyer_init_camera
# env = ImageMujocoEnv(
#         env,
#         init_camera=sawyer_init_camera,
#     )
# env.enable_render()

policy = ZeroPolicy(env.action_space.low.size)
es = OUStrategy(
    env.action_space,
    theta=1
)
es = EpsilonGreedy(  # replaces the OUStrategy above; arguments are assumed, following the earlier examples
    env.action_space,
    prob_random_action=0.1,
)
Example 6
def experiment(variant):
    feat_points = 16
    history = 1
    latent_obs_dim = feat_points * 2 * history
    imsize = 64
    downsampled_size = 32

    env = SawyerXYZEnv()
    extra_fc_size = env.obs_dim
    env = ImageMujocoWithObsEnv(env,
                                imsize=imsize,
                                normalize=True,
                                grayscale=True,
                                keep_prev=history - 1,
                                init_camera=camera.sawyer_init_camera)
    """env = ImageMujocoEnv(env,
                        imsize=imsize,
                        keep_prev=history-1,
                        init_camera=camera.sawyer_init_camera)"""

    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    ae = FeatPointMlp(input_size=imsize,
                      downsample_size=downsampled_size,
                      input_channels=1,
                      num_feat_points=feat_points)
    replay_buffer = AEEnvReplayBuffer(int(1e4),
                                      env,
                                      imsize=imsize,
                                      history_length=history,
                                      downsampled_size=downsampled_size)

    qf = FlattenMlp(input_size=latent_obs_dim + extra_fc_size + action_dim,
                    output_size=1,
                    hidden_sizes=[400, 300])
    policy = AETanhPolicy(
        input_size=latent_obs_dim + extra_fc_size,
        ae=ae,
        env=env,
        history_length=history,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    algorithm = FeatPointDDPG(ae,
                              history,
                              env=env,
                              qf=qf,
                              policy=policy,
                              exploration_policy=exploration_policy,
                              replay_buffer=replay_buffer,
                              extra_fc_size=extra_fc_size,
                              imsize=imsize,
                              downsampled_size=downsampled_size,
                              **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
Example 7
from railrl.envs.mujoco.sawyer_gripper_env import SawyerXYZEnv
from railrl.envs.wrappers import ImageMujocoEnv
import cv2
import numpy as np

print("making env")
sawyer = SawyerXYZEnv()
env = ImageMujocoEnv(sawyer, imsize=400)

print("starting rollout")
while True:
    obs = env.reset()
    for t in range(1000):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        raw_img = env._image_observation()
        # flip the image vertically and reverse the channel order for cv2.imshow (which expects BGR)
        img = np.concatenate((
            raw_img[::-1, :, 2:3],
            raw_img[::-1, :, 1:2],
            raw_img[::-1, :, 0:1],
        ), axis=2)
        cv2.imshow('obs', img)
        cv2.waitKey(1)
        # if done:
        #     break
    print("new episode")
from railrl.envs.mujoco.sawyer_gripper_env import SawyerXYZEnv

from railrl.exploration_strategies.base import \
    PolicyWrappedWithExplorationStrategy
from railrl.exploration_strategies.ou_strategy import OUStrategy
from railrl.policies.simple import ZeroPolicy

print("making env")
env = SawyerXYZEnv()
policy = ZeroPolicy(env.action_space.low.size)
es = OUStrategy(env.action_space, theta=1)
policy = exploration_policy = PolicyWrappedWithExplorationStrategy(
    exploration_strategy=es,
    policy=policy,
)
print("starting rollout")
while True:
    obs = env.reset()
    last_reward_t = 0
    returns = 0
    for t in range(1000):
        # action = env.action_space.sample()*10
        # ZeroPolicy's action does not depend on the observation, so None is passed
        action, _ = policy.get_action(None)
        # flip the fourth action dimension every 100 steps (presumably toggling the gripper)
        if (t // 100) % 2 == 0:
            action[3] = -10
        else:
            action[3] = 10
        obs, reward, done, info = env.step(action)
        env.render()
        print("action", action)
        if done: