def run_task(*_):

    env = normalize(
        GymEnv(env_name="MountainCarContinuous-v0", force_reset=True))
    max_path_length = 300

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=100,
        n_updates_per_sample=1,
        max_path_length=max_path_length,
        epoch_length=900,
        min_pool_size=800,
        replay_pool_size=5000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
    )
    algo.train()
Example #2
def main(exp_name, ent_wt=1.0):
    register_custom_envs()
    env_name = 'LunarLanderContinuous-v3'
    env = GymEnv(env_name)
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=350,
        epoch_length=350,
        min_pool_size=350,
        n_epochs=600,
        discount=0.99,
        scale_reward=1.0/140.0,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    data_path = 'data/%s_data_rllab_%s/%s/' % (env_name.replace('-', '_'),
                                               str(algo.__class__.__name__),
                                               exp_name)
    os.makedirs(data_path, exist_ok=True)
    logger.set_snapshot_dir(data_path)
    algo.train()
    logger.set_snapshot_dir(None)
Example #3
def run_task(*_):
    env = normalize(SwimmerEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=200,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of the policy during training
        plot=True,
    )
    algo.train()
Example #4
            def run_task(*_):
                env = normalize(Walker2DEnv())

                policy = DeterministicMLPPolicy(
                    env_spec=env.spec,
                    # Hidden layer sizes come from the H_layer_first / H_layer_second settings defined elsewhere in the script.
                    hidden_sizes=(H_layer_first[h], H_layer_second[h])
                )

                es = OUStrategy(env_spec=env.spec)

                qf = ContinuousMLPQFunction(env_spec=env.spec)

                algo = DDPG(
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    batch_size=size_of_batch,
                    max_path_length=100,
                    epoch_length=1000,
                    min_pool_size=10000,
                    n_epochs=number_of_episodes,
                    discount=discount_factor,
                    scale_reward=reward_scaling[r],
                    qf_learning_rate=critic_learning_rate[c],
                    policy_learning_rate=actor_learning_rate[c],
                    # Uncomment both lines (this and the plot parameter below) to enable plotting
                    # plot=True,
                )
                algo.train()
Example #5
def run_task(*_):
    """
    DPG on Swimmer environment
    """
    env = normalize(SwimmerEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining exploration strategy : OUStrategy - 
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
    The OU process satisfies the following stochastic differential equation:
    dxt = theta*(mu - xt)*dt + sigma*dWt
    where Wt denotes the Wiener process
    """
    es = OUStrategy(env_spec=env.spec)
    """
    Defining the Q network
    """
    qf = ContinuousMLPQFunction(env_spec=env.spec)

    w = qf.get_param_values(regularizable=True)
    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(env=env, qf=qf, policy=policy)
    """
    Using the DDPG algorithm
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=32,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=15000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # plot=True enables live plotting of the policy during training
        plot=True,
    )
    """
    Training the networks based on the DDPG algorithm
    """
    algo.train()
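The docstring above states the OU dynamics dx_t = theta*(mu - x_t)*dt + sigma*dW_t. For readers who want to see what that noise looks like, here is a minimal, self-contained NumPy sketch of an Euler-Maruyama discretization of that equation; it is only an illustration, not rllab's OUStrategy implementation, and the defaults theta=0.15, mu=0.0, sigma=0.3 are assumptions chosen to match common usage.

import numpy as np

def ou_noise(n_steps, action_dim, theta=0.15, mu=0.0, sigma=0.3, dt=1.0, seed=0):
    """Simulate OU noise: dx = theta * (mu - x) * dt + sigma * dW."""
    rng = np.random.RandomState(seed)
    x = np.zeros(action_dim)
    samples = []
    for _ in range(n_steps):
        dW = rng.normal(size=action_dim) * np.sqrt(dt)   # Wiener increment
        x = x + theta * (mu - x) * dt + sigma * dW       # Euler-Maruyama step
        samples.append(x.copy())
    return np.array(samples)

# Successive samples are correlated and drift back toward mu at rate theta,
# which is why this noise explores more smoothly than i.i.d. Gaussian noise.
noise = ou_noise(n_steps=1000, action_dim=2)
print(noise[:3])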
Example #6
            def run_task(*_):

                env = normalize(SimpleHumanoidEnv())
                # env = SimpleHumanoidEnv()

                policy = DeterministicMLPPolicy(
                    env_spec=env.spec,
                    # The neural network policy should have two hidden layers, each with 32 hidden units.
                    hidden_sizes=(32, 32))

                es = OUStrategy(env_spec=env.spec)

                qf = ContinuousMLPQFunction(env_spec=env.spec,
                                            hidden_sizes=(32, 32))
                """
                Persistence Length Exploration
                """
                lp = Persistence_Length_Exploration(
                    env=env,
                    qf=qf,
                    policy=policy,
                    L_p=L_p_param[l_p_ind],
                    b_step_size=b_step_size[b_ind],
                    sigma=sigma_param[s_ind],
                    max_exploratory_steps=max_exploratory_steps_iters,
                    batch_size=batch_size_value,
                    n_epochs=num_episodes,
                    scale_reward=0.01,
                    epoch_length=steps_per_episode,
                    qf_learning_rate=0.001,
                    policy_learning_rate=0.0001,
                )
                """
                DDPG
                """

                algo = DDPG(
                    env=env,
                    policy=policy,
                    es=es,
                    qf=qf,
                    lp=lp,
                    batch_size=batch_size_value,
                    max_path_length=100,
                    epoch_length=steps_per_episode,
                    min_pool_size=10000,
                    n_epochs=num_episodes,
                    discount=0.99,
                    scale_reward=0.01,
                    qf_learning_rate=0.001,
                    policy_learning_rate=0.0001,
                    # Uncomment both lines (this and the plot parameter below) to enable plotting
                    # plot=True,
                )
                algo.train()
Example #7
def run_task(*_):
    """
    DPG on Hopper environment
    """
    env = normalize(HopperEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, with 400 and 300 hidden units.
        hidden_sizes=(400, 300))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    """
    Using the DDPG algorithm
    """
    # algo = DDPG(
    #     env=env,
    #     policy=policy,
    #     es=es,
    #     qf=qf,
    #     batch_size=32,
    #     max_path_length=500,
    #     epoch_length=500,
    #     min_pool_size=10000,
    #     n_epochs=20000,
    #     discount=0.99,
    #     scale_reward=0.01,
    #     qf_learning_rate=1e-3,
    #     policy_learning_rate=1e-4,
    #     #Uncomment both lines (this and the plot parameter below) to enable plotting
    #     plot=True,
    # )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=1000,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=10000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=10e-3,  # i.e. 1e-2
        policy_learning_rate=10e-4,  # i.e. 1e-3
        # plot=True enables live plotting of the policy during training
        plot=True,
    )

    algo.train()
Example #8
def run_task(*_):

    f = open('/home/qingkai/ddpg_performance.csv', "w+")

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
    qf_cost = ContinuousMLPQFunction(env_spec=env.spec)

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    algo = PDO_DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        qf_cost=qf_cost,
        dual_var=0,
        safety_constraint=safety_constraint,
        batch_size=64,
        max_path_length=15,
        epoch_length=10000,
        min_pool_size=10000,
        n_epochs=150,
        discount=0.99,
        qf_learning_rate=1e-3,
        qf_cost_learning_rate=1e-3,
        dual_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        scale_reward=1,
        scale_cost=5,
        soft_target=True,
        soft_target_tau=0.001,
        eval_samples=10000,
        qf_weight_decay=0.,
        qf_cost_weight_decay=0.,
        avg_horizon=100000,
        #plot=True,
    )

    algo.train()
    f.close()
Example #9
def test_ddpg():
    env = CartpoleEnv()
    policy = DeterministicMLPPolicy(env.spec)
    qf = ContinuousMLPQFunction(env.spec)
    es = OUStrategy(env.spec)
    algo = DDPG(
        env=env, policy=policy, qf=qf, es=es,
        n_epochs=1,
        epoch_length=100,
        batch_size=32,
        min_pool_size=50,
        replay_pool_size=1000,
        eval_samples=100,
    )
    algo.train()
Example #10
def test_rllab(patient_id=1, Initial_Bg=0):
    try:
        from rllab.algos.ddpg import DDPG
        from rllab.envs.normalized_env import normalize
        from rllab.exploration_strategies.ou_strategy import OUStrategy
        from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
        from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
        from rllab.envs.gym_env import GymEnv
    except ImportError:
        print('rllab is not installed!')
        return None

    env = GymEnv('simglucose-adult{}-CHO{}-v0'.format(Initial_Bg,
                                                      patient_id + 1))
    env = normalize(env)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each
        # with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(env=env,
                policy=policy,
                es=es,
                qf=qf,
                batch_size=32,
                max_path_length=100,
                epoch_length=1000,
                min_pool_size=10000,
                n_epochs=5,
                discount=0.99,
                scale_reward=0.01,
                qf_learning_rate=1e-3,
                policy_learning_rate=1e-4)
    algo.train()

    # env.close()

    return es, policy
Example #11
def run_task(*_):
    env = normalize(GymEnv(args.env, force_reset=True, record_video=False))
    env.wrapped_env.env.env.reward_flag = args.reward

    if args.hidden_sizes == 0:
        hidden_sizes = (8,)
    elif args.hidden_sizes == 1:
        hidden_sizes = (32, 32)
    elif args.hidden_sizes == 2:
        hidden_sizes = (100, 50, 25)
    elif args.hidden_sizes == 3:
        hidden_sizes = (400, 300)
    else:
        raise ValueError('Unexpected args.hidden_sizes value: %s' % args.hidden_sizes)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # Hidden layer sizes are selected above from the args.hidden_sizes flag.
        hidden_sizes=hidden_sizes
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=64,
        max_path_length=95,
        epoch_length=args.batch_size,
        min_pool_size=10000,
        n_epochs=args.n_itr,
        discount=args.gamma,
        scale_reward=args.scale_reward,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        eval_samples=95,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #12
def run_task(*_):
    # env = normalize(HalfCheetahEnv())

    env = normalize(GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # env = normalize(GymEnv(env_name="BipedalWalker-v2", force_reset=True, record_video=True))
    max_path_length = 400
    # print("env.horizon: ",env.horizon)
    # input()
    # env._max_episode_steps = max_path_length

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=(64, 64)
                                )

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=max_path_length,
        train_epoch_interval=300,
        min_pool_size=500,
        replay_pool_size=10000,
        n_updates_per_sample=1,
        n_steps=75000,
        discount=0.99,
        scale_reward=0.1,
        qf_learning_rate=1e-2,
        policy_learning_rate=1e-3,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example #13
step_size = 0.01
learn_std = True
init_std = 1

# =========================================
# Setting the neural network architecture
# =========================================
# hidden_sizes=(8,)
hidden_sizes = (32, 32)
# hidden_sizes=(100, 50, 25)

# ===================
# Defining the policy
# ===================
policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    hidden_sizes=hidden_sizes,
)

# =======================
# Defining the algorithm
# =======================
es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(env_spec=env.spec)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    max_path_length=96,
Example #14
gymenv = GymEnv(args.env,
                force_reset=True,
                record_video=False,
                record_log=False)

env = normalize(gymenv)

activation_map = {
    "relu": NL.rectify,
    "tanh": NL.tanh,
    "leaky_relu": NL.LeakyRectify
}

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    # Hidden layer sizes are taken from args.policy_size.
    hidden_sizes=args.policy_size,
    hidden_nonlinearity=activation_map[args.policy_activation],
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=activation_map[args.vf_activation],
    hidden_sizes=args.vf_size,
)

algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
Example #15
    "Crouch": CrouchEnv,
    "Hop": HopEnv
}

env = normalize(envs[parsed.env](visualize=False))

# env = normalize(CartpoleEnv())
# env = normalize(GymEnv("Pendulum-v0", record_video=False, record_log=False))

if alg == "DDPG":
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64, 64))

    es = OUStrategy(env_spec=env.spec, theta=0.5)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have three hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32, 32))

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=100,
        min_pool_size=1000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
Example #16
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = PointGatherEnv(apple_reward=10,
                         bomb_cost=1,
                         n_apples=2,
                         activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor)
        })

    safety_constraint = GatherSafetyConstraint(max_value=0.2)

    ddpg_policy = DeterministicMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=(64, 32))

    ddpg_es = OUStrategy(env_spec=env.spec)

    ddpg_qf = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=(100, 100))
    ddpg_qf_cost = ContinuousMLPQFunction(env_spec=env.spec,
                                          hidden_sizes=(100, 100))

    offline_itr_n = 100000

    algo = PDO_OFF(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        batch_size=20000,
        max_path_length=15,
        n_itr=200,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        ddpg_policy=ddpg_policy,
        ddpg_qf=ddpg_qf,
        ddpg_qf_cost=ddpg_qf_cost,
        ddpg_es=ddpg_es,
        ddpg_dual_var=0,
        ddpg_batch_size=64,
        ddpg_qf_learning_rate=1e-4,
        ddpg_qf_cost_learning_rate=1e-4,
        ddpg_dual_learning_rate=1e-3,
        ddpg_policy_learning_rate=1e-3,
        ddpg_scale_reward=1,
        ddpg_scale_cost=1,
        offline_itr_n=offline_itr_n,
        balance=0,
        safety_tradeoff_coeff_lr=1e-2,
        ddpg_avg_horizon=offline_itr_n,
        adjust_epoch=5,
        ddpg_qf_weight_decay=0.,
        #plot=True,
    )

    algo.train()
    f.close()
Example #17
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
import lasagne.nonlinearities as NL

from fwmav_sim_env_maneuver import FWMAVSimEnv
import os
import rllab.misc.logger as logger
import pickle
import sys
import lasagne.init as LI

env = normalize(FWMAVSimEnv())
policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    hidden_nonlinearity=NL.tanh,  # alternatives: NL.rectify, NL.LeakyRectify
    output_nonlinearity=NL.tanh,
    hidden_sizes=(32, 32),
)

# theta is the decay rate of the noise: a smaller theta decays more slowly and
# stays correlated over more steps (theta = 0.01 is about 220 steps, 0.022 about
# 100 steps, 0.1 about 20 steps, 0.15 about 15 steps). sigma is the size of the noise.
es = OUStrategy(env_spec=env.spec, theta=0.15, sigma=0.3)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_nonlinearity=NL.tanh,
    output_nonlinearity=None,
    hidden_sizes=(128, 128),
    output_W_init=LI.Uniform(-3e-6, 3e-6),
    output_b_init=LI.Uniform(-3e-6, 3e-6),
)
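
The step counts in the comment above the OUStrategy call (about 220 steps for theta = 0.01, about 100 for 0.022, about 20 for 0.1, about 15 for 0.15) are consistent with the OU autocorrelation exp(-theta * lag) falling to roughly 10%, which happens after ln(10)/theta, i.e. about 2.3/theta steps. The 10% threshold is an assumption used only to make sense of those figures; a quick sketch:

import math

for theta in (0.01, 0.022, 0.1, 0.15):
    # With dt = 1 the OU autocorrelation at lag k is exp(-theta * k),
    # so it decays to 10% after ln(10) / theta steps.
    steps_to_10_percent = math.log(10.0) / theta
    print("theta = %.3f -> noise correlation falls to 10%% after ~%d steps"
          % (theta, round(steps_to_10_percent)))

# Prints roughly 230, 105, 23 and 15 steps, close to the figures quoted above.
# sigma only scales the noise amplitude; it does not affect the correlation length.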