def __init__(self,
             survive_reward=2e-1,
             ctrl_cost_coeff=1e-3,
             contact_cost_coeff=1e-5,
             velocity_deviation_cost_coeff=1e-2,
             *args,
             **kwargs):
    MultiDirectionBaseEnv.__init__(
        self,
        survive_reward=survive_reward,
        ctrl_cost_coeff=ctrl_cost_coeff,
        contact_cost_coeff=contact_cost_coeff,
        velocity_deviation_cost_coeff=velocity_deviation_cost_coeff,
        *args,
        **kwargs)
    HumanoidEnv.__init__(
        self,
        # survive_reward=survive_reward,
        alive_bonus=survive_reward,  # TODO: remove this
        ctrl_cost_coeff=ctrl_cost_coeff,
        # contact_cost_coeff=contact_cost_coeff,
        impact_cost_coeff=contact_cost_coeff,  # TODO: remove this
        vel_deviation_cost_coeff=velocity_deviation_cost_coeff,  # TODO: remove this
        *args,
        **kwargs)
Example #2
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )
    df = DFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M])  # discriminator, input is the actions.
    vf = VFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
        df=df,
        vf=vf,
        df_lr=1e-3,
        dist=variant['dist'],
    )

    algorithm.train()
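
For reference, a minimal sketch of a variant dictionary that covers every key read by the run_experiment above. The values are illustrative placeholders, not the settings used in the original experiment.

# Illustrative variant for the SQL example above; values are assumptions.
variant = {
    'env_name': 'humanoid-rllab',
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 500,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 128,
    'qf_lr': 3e-4,
    'policy_lr': 3e-4,
    'discount': 0.99,
    'reward_scale': 1.0,
    'dist': 'normal',  # consumed by this SQL variant's `dist` argument
}

run_experiment(variant)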
Example #3
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'BlocksSimpleXYQ-v0':
        target = [-1.0, 0.0]
        env = bsmp.BlocksSimpleXYQ(multi_goal=variant['blocks_multigoal'],
                                   time_limit=variant['max_path_length'],
                                   env_config=variant['blocks_simple_xml'],
                                   goal=target)
        env = env_wrap.obsTupleWrap(env, add_action_to_obs=False)
        env = gym_env.GymEnv(
            env,
            video_schedule=glob.video_scheduler.video_schedule,
            log_dir=".")
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(env=env,
                              max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(max_path_length=variant['max_path_length'],
                            min_pool_size=variant['max_path_length'],
                            batch_size=variant['batch_size'])

    base_kwargs = dict(epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env=env, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env=env, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
Example #4
def run_task(*_):

    env = normalize(HumanoidEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(32, 32))
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )
    """
    DDPG
    """

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment the following line to enable plotting:
        # plot=True,
    )
    algo.train()
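
run_task above references hyperparameter grids and counters defined outside the function. A sketch of assumed outer-scope definitions that would let it execute; the values are placeholders, not the original sweep.

# Assumed outer-scope values referenced by run_task above; placeholders only.
L_p_param = [0.1, 0.5, 1.0]          # persistence-length candidates
b_step_size = [0.001, 0.01]          # step-size candidates
sigma_param = [0.05, 0.1]            # exploration-noise candidates
l_p_ind, b_ind, s_ind = 0, 0, 0      # indices selecting one value from each grid
max_exploratory_steps_iters = 20
batch_size_value = 64
num_episodes = 1000                  # passed to n_epochs
steps_per_episode = 1000             # passed to epoch_length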
Example #5
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'sawyer-rllab':
        env = normalize(SawyerTestEnv())
    elif variant['env_name'] == 'arm3Ddisc-rllab':
        env = normalize(Arm3dDiscEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec, max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
Example #6
def get(perm):
    name = perm["problem"]
    if name.lower() == "cartpole":
        from rllab.envs.box2d.cartpole_env import CartpoleEnv
        return normalize(CartpoleEnv())

    elif name.lower() == "mountain car height bonus":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv())

    elif name.lower() == "mountain car":
        from rllab.envs.box2d.mountain_car_env import MountainCarEnv
        return normalize(MountainCarEnv(height_bonus=0))

    elif name.lower() == "gym mountain car":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("MountainCarContinuous-v0",
                                record_video=False))

    elif name.lower() == "pendulum":
        from rllab.envs.gym_env import GymEnv
        return normalize(GymEnv("Pendulum-v0", record_video=False))

    elif name.lower() == "mujoco double pendulum":
        from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv
        return normalize(InvertedDoublePendulumEnv())

    elif name.lower() == "double pendulum":
        from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
        return normalize(DoublePendulumEnv())

    elif name.lower() == "hopper":
        from rllab.envs.mujoco.hopper_env import HopperEnv
        return normalize(HopperEnv())

    elif name.lower() == "swimmer":
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        return normalize(SwimmerEnv())

    elif name.lower() == "2d walker":
        from rllab.envs.mujoco.walker2d_env import Walker2DEnv
        return normalize(Walker2DEnv())

    elif name.lower() == "half cheetah":
        from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
        return normalize(HalfCheetahEnv())

    elif name.lower() == "ant":
        from rllab.envs.mujoco.ant_env import AntEnv
        return normalize(AntEnv())

    elif name.lower() == "simple humanoid":
        from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
        return normalize(SimpleHumanoidEnv())

    elif name.lower() == "full humanoid":
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        return normalize(HumanoidEnv())

    else:
        raise NotImplementedError(f"Environment {name} unknown")
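
A small usage sketch for get, assuming the corresponding rllab/MuJoCo dependencies are installed; perm only needs a "problem" key.

# Hypothetical usage: build a normalized environment from a config dict.
perm = {"problem": "half cheetah"}
env = get(perm)
obs = env.reset()
print(env.observation_space, env.action_space)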
Example #7
                    help='eval interval (step)')
parser.add_argument('--ckpt-interval', type=int, default=499999,
                    help='checkpoint interval (step)')
parser.add_argument('--gpu', type=int, default=4,
                    help='CUDA device index; a negative value runs on CPU (default: 4)')

args = parser.parse_args()
args.hadamard = bool(args.hadamard)
if args.gpu >= 0:
    print("gpu ok")
    ptu.set_gpu_mode(True, args.gpu)
# set env
if args.env_name == 'Humanoidrllab':
    from rllab.envs.mujoco.humanoid_env import HumanoidEnv
    from rllab.envs.normalized_env import normalize
    env = normalize(HumanoidEnv())
    max_episode_steps = float('inf')
    if args.seed >= 0:
        global seed_
        seed_ = args.seed
else:
    env = gym.make(args.env_name)
    max_episode_steps = env._max_episode_steps
    env = NormalizedActions(env)
    if args.seed >= 0:
        env.seed(args.seed)
if args.seed >= 0:
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
Example #8
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))
    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size']
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],

        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
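
As above, a sketch of a variant dictionary covering the keys this SAC script reads; the values are illustrative assumptions, not tuned settings.

# Illustrative variant for the SAC example above; values are assumptions.
variant = {
    'env_name': 'swimmer-rllab',
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 500,
    'n_train_repeat': 1,
    'batch_size': 128,
    'layer_size': 128,
    'K': 4,                    # number of GMM mixture components
    'reparameterize': False,
    'lr': 3e-4,
    'scale_reward': 1.0,
    'discount': 0.99,
    'tau': 0.01,
}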
Example #9
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(min_pool_size=variant['max_path_length'],
                       epoch_length=variant['epoch_length'],
                       n_epochs=variant['n_epochs'],
                       max_path_length=variant['max_path_length'],
                       batch_size=variant['batch_size'],
                       n_train_repeat=variant['n_train_repeat'],
                       eval_render=False,
                       eval_n_episodes=1,
                       eval_deterministic=True,
                       sampler=SimpleSampler(
                           max_path_length=variant["max_path_length"],
                           min_pool_size=variant["max_path_length"],
                           batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )

    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(base_kwargs=base_kwargs,
                      env=env,
                      policy=policy,
                      discriminator=discriminator,
                      pool=pool,
                      qf=qf,
                      vf=vf,
                      lr=variant['lr'],
                      scale_entropy=variant['scale_entropy'],
                      discount=variant['discount'],
                      tau=variant['tau'],
                      num_skills=variant['num_skills'],
                      save_full_state=False,
                      include_actions=variant['include_actions'],
                      learn_p_z=variant['learn_p_z'],
                      add_p_z=variant['add_p_z'],
                      reparametrize=variant["reparametrize"])

    algorithm.train()
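
The augmented spec above simply widens the observation space by num_skills extra dimensions. A minimal sketch of how an observation would be concatenated with a one-hot skill vector to match aug_env_spec; the sizes here are assumed for illustration.

import numpy as np

obs_dim, num_skills = 20, 10              # assumed sizes, for illustration only
obs = np.zeros(obs_dim)                   # stand-in for an environment observation
z = np.random.randint(num_skills)         # sampled skill index
aug_obs = np.concatenate([obs, np.eye(num_skills)[z]])
assert aug_obs.shape == (obs_dim + num_skills,)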
Example #10
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)
    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,

        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],

        # Additional params for behaviour tracking
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),

    )

    algorithm.train()
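
This last example also depends on args and a get_logdir helper defined elsewhere in the script. A hypothetical stand-in for get_logdir, only to make the call site concrete; the real helper is not part of this excerpt.

import os

# Hypothetical stand-in; the actual get_logdir implementation is not shown above.
def get_logdir(args, variant):
    return os.path.join('logs', variant['prefix'], variant['metric'])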