def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
    )

    algorithm.train()
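For reference, a minimal sketch of the variant dictionary that run_experiment above reads from; the keys mirror the lookups in the function, while the values are illustrative placeholders rather than the original configuration.

variant = dict(
    env_name='swimmer-rllab',      # or 'humanoid-rllab', or any Gym env id
    max_pool_size=int(1e6),
    max_path_length=1000,
    epoch_length=1000,
    n_epochs=500,
    batch_size=128,
    n_train_repeat=1,
    layer_size=128,
    qf_lr=3e-4,
    policy_lr=3e-4,
    discount=0.99,
    reward_scale=1.0,
)
run_experiment(variant)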
Example 2
def run_task(*_):
    # Please note that environments with different action spaces may require different policies.
    # For example, a GaussianMLPPolicy works with a Box action space, but a Discrete action space
    # may require a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example and the sketch
    # after this example).
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
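As the comment above notes, a Discrete action space calls for a categorical policy instead of a Gaussian one. A minimal sketch of that variant, assuming rllab's CategoricalMLPPolicy (its import path is not shown in the snippet above) and the CartPole-v0 Gym environment:

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy

# CartPole-v0 has a Discrete(2) action space, so a categorical policy is used
# in place of the GaussianMLPPolicy shown above.
env = normalize(GymEnv("CartPole-v0"))

policy = CategoricalMLPPolicy(
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
)
algo.train()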
Example 3
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
Example 4
def test():

    env = normalize(MultiGoalEnv())

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=1e6,
    )

    base_kwargs = dict(
        min_pool_size=100,
        epoch_length=100,
        n_epochs=1000,
        max_path_length=30,
        batch_size=64,
        n_train_repeat=1,
        eval_render=True,
        eval_n_episodes=10,
    )

    M = 128
    policy = StochasticNNPolicy(
        env.spec, hidden_layer_sizes=(M, M), squash=True)

    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        plotter=plotter,
        policy_lr=3e-4,
        qf_lr=3e-4,
        value_n_particles=16,
        td_target_update_interval=1000,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        discount=0.99,
        reward_scale=0.1,
        save_full_state=False,
    )
    algorithm.train()
Example 5
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params
    pprint(params)

    grid_world = SlaveGridWorldEnv("walled_chain",
                                   max_traj_length=DEFAULTS["max_path_length"],
                                   goal_reward=params["goal_reward"])
    agent = GridWorldMasterAgent(grid_world, match_reward=params["match_reward"])
    env = normalize(SituatedConversationEnvironment(env=grid_world, b_agent=agent))
    baseline = LinearFeatureBaseline(env)

    policy = RecurrentCategoricalPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_dims=params["policy_hidden_dims"],
            feature_network=MLPNetworkWithEmbeddings(
                "feature_network", env.observation_space.flat_dim,
                params["feature_dim"], params["feature_hidden_dims"],
                tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]),
            state_include_action=False,
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=params["max_path_length"],
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=15,
            snapshot_mode="last",
            exp_prefix="grid_world_sweep3",
            variant=params,
    )
Example 6
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
Example 7
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)

    policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_dims"],
    )

    optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))

    algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=params["batch_size"],
            max_path_length=5,
            n_itr=params["n_itr"],
            discount=0.99,
            step_size=params["step_size"],
            optimizer=optimizer,
    )

    run_experiment_lite(
            algo.train(),
            n_parallel=5,
            snapshot_mode="last",
            exp_prefix="grid_world_silent",
            variant=params,
    )
Example 8
def run_task(v):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
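run_task above pulls its hyperparameters from the dictionary v. A hedged sketch of how it might be launched with rllab's run_experiment_lite; the step-size values and experiment prefix are placeholders, not taken from the original script.

from rllab.misc.instrument import run_experiment_lite

for step_size in [0.01, 0.05, 0.1]:
    run_experiment_lite(
        run_task,
        n_parallel=1,
        snapshot_mode="last",
        seed=1,
        exp_prefix="trpo_cartpole_step_size",   # illustrative prefix
        variant=dict(step_size=step_size),      # passed to run_task as v
    )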
Example 9
variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    task_var = v['task_var']
    oracle = v['oracle']

    if task_var == 0:
        task_var = 'direc'
        exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvDirecOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandDirec()))
    elif task_var == 1:
        task_var = 'vel'
        exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvOracle()))
        else:
            env = TfEnv(normalize(AntEnvRand()))
    elif task_var == 2:
        task_var = 'pos'
        exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length)
        if oracle:
            env = TfEnv(normalize(AntEnvRandGoalOracle()))
        else:
            env = TfEnv(normalize(AntEnvRandGoal()))
Example 10
def run_task(*_):
    """Implement the run_task method needed to run experiments with rllab."""
    v_enter = 10
    inner_length = 300
    long_length = 100
    short_length = 300
    n = 3
    m = 3
    num_cars_left = 1
    num_cars_right = 1
    num_cars_top = 1
    num_cars_bot = 1
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, render=True)

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     min_gap=2.5, tau=1.1, max_speed=v_enter),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    tl_logic = TrafficLights(baseline=False)

    additional_env_params = {
        "target_velocity": 50,
        "switch_time": 3.0,
        "num_observed": 2,
        "discrete": False,
        "tl_type": "controlled"
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1
    }

    initial_config, net_params = get_flow_params(10, 300, n, m,
                                                 additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config,
                                  traffic_lights=tl_logic)

    env_name = "PO_TrafficLightGridEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Example 11

# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml=True

for v in variants:
    task_var = v['task_var']

    if task_var == 0:
        env = TfEnv(normalize(AntEnvRandDirec()))
        task_var = 'direc'
    elif task_var == 1:
        env = TfEnv(normalize(AntEnvRand()))
        task_var = 'vel'
    elif task_var == 2:
        env = TfEnv(normalize(AntEnvRandGoal()))
        task_var = 'pos'
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100,100),
    )
Example 12
from humanoidopt.env import HumanoidOptEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(HumanoidOptEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()
Example 13
from rllab.misc.instrument import VariantGenerator, stub, run_experiment_lite
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.core import layers as L
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp

from praglang.environments import BagAutoencoderEnvironment
from praglang.policies import RecurrentCategoricalPolicy
from praglang.util import MLPNetworkWithEmbeddings

# Used further below by normalize(...) and config.LOG_DIR.
from rllab import config
from rllab.envs.normalized_env import normalize


stub(globals())

LENGTH = 5
VOCAB = list("abcdefghijklmnopqrstuvwxyz")

env = normalize(BagAutoencoderEnvironment(VOCAB, LENGTH, "autoenc"))


DEFAULTS = {
    "batch_size": 5000,
    "n_itr": 500,
    "step_size": 0.001,
    "policy_hidden_dims": (128,),
    "embedding_dim": 32,
    "feature_dim": 128,
    "feature_hidden_dims": (),
}

config.LOG_DIR = "./log"

def run_experiment(params):

Example 14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('env_fname',
                        type=str,
                        help='config file with environment arguments')
    parser.add_argument('transformers_fname', type=str)
    parser.add_argument('mean_network_type',
                        type=str,
                        choices=['conv', 'siamese'])
    parser.add_argument('--conv_filters',
                        nargs='*',
                        type=int,
                        default=[16, 32])
    parser.add_argument('--hidden_sizes', nargs='*', type=int, default=[16])
    parser.add_argument('--init_std', type=float, default=1.0)
    parser.add_argument('--n_itr', type=int, default=100)
    parser.add_argument('--step_size', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=10000)
    parser.add_argument('--use_static_car', action='store_true')
    parser.add_argument('--use_init_heuristic', action='store_true')
    args = parser.parse_args()

    with open(args.env_fname) as yaml_string:
        env_config = yaml.load(yaml_string)
        if issubclass(env_config['class'], envs.RosEnv):
            import rospy
            rospy.init_node("generate_data")
        env = from_config(env_config)

    if args.use_static_car:
        env.car_env.speed_offset_space.low = \
        env.car_env.speed_offset_space.high = np.array([0.0, 4.0])

    # transformers
    with open(args.transformers_fname) as transformers_file:
        transformers_config = yaml.load(transformers_file)
    transformers = dict()
    for data_name, transformer_config in transformers_config.items():
        if data_name == 'action':
            replace_config = {'space': env.action_space}
        elif data_name in env.observation_space.spaces:
            replace_config = {'space': env.observation_space.spaces[data_name]}
        else:
            replace_config = {}
        transformers[data_name] = from_config(transformers_config[data_name],
                                              replace_config=replace_config)

    env = ServoingEnv(env)
    env = RllabEnv(env, transformers=transformers)
    env = normalize(env)

    network_kwargs = dict(
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.flat_dim,
        conv_filters=args.conv_filters,
        conv_filter_sizes=[3] * len(args.conv_filters),
        conv_strides=[2] * len(args.conv_filters),
        conv_pads=[0] * len(args.conv_filters),
        hidden_sizes=args.hidden_sizes,
        hidden_nonlinearity=LN.rectify,
        output_nonlinearity=None,
        name="mean_network",
    )
    if args.mean_network_type == 'conv':
        mean_network = ConvNetwork(**network_kwargs)
    elif args.mean_network_type == 'siamese':
        mean_network = SiameseQuadraticErrorNetwork(**network_kwargs)
    else:
        raise NotImplementedError

    policy = GaussianConvPolicy(
        env_spec=env.spec,
        init_std=args.init_std,
        mean_network=mean_network,
    )
    if args.use_init_heuristic:
        W_var = policy.get_params()[0]
        W = W_var.get_value()
        W[:, 3:, :, :] = -W[:, :3, :, :]
        W_var.set_value(W)
    baseline = GaussianConvBaseline(
        env_spec=env.spec,
        regressor_args=dict(
            use_trust_region=True,
            step_size=args.step_size,
            normalize_inputs=True,
            normalize_outputs=True,
            hidden_sizes=args.hidden_sizes,
            conv_filters=args.conv_filters,
            conv_filter_sizes=[3] * len(args.conv_filters),
            conv_strides=[2] * len(args.conv_filters),
            conv_pads=[0] * len(args.conv_filters),
            batchsize=args.batch_size * 10,
        ))

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=100,
        n_itr=args.n_itr,
        discount=0.9,
        step_size=args.step_size,
    )
    algo.train()
    import IPython as ipy
    ipy.embed()
Example 15
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a linear baseline estimator using default hand-crafted features
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')


    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one returns a dictionary, whose values contains
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)


            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])


        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or z transformation.
            # now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)


        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)
    return results
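The mode string selects the baseline and the advantage normalization through the substring checks inside doit. A small illustrative driver; the mode names below are the ones the branches above recognize.

for mode in ["vanilla", "linbaseline", "linbaseline_ztrans", "batchavg"]:
    returns = doit(mode)
    print(mode, 'final average return:', returns[-1])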
Example 16
                                    rd.seed(seed)

                                    ###
                                    seed %= 4294967294
                                    global seed_
                                    seed_ = seed
                                    rd.seed(seed)
                                    np.random.seed(seed)
                                    try:
                                        import tensorflow as tf

                                        tf.set_random_seed(seed)
                                    except Exception as e:
                                        print(e)
                                    print('using seed %s' % (str(seed)))
                                    env = TfEnv(normalize(PointEnvRandGoal()))
                                    policy = MAMLGaussianMLPPolicy(
                                        name="policy",
                                        env_spec=env.spec,
                                        grad_step_size=fast_learning_rate,
                                        hidden_nonlinearity=tf.nn.relu,
                                        hidden_sizes=(100, 100),
                                        std_modifier=pre_std_modifier,
                                    )
                                    if bas == 'zero':
                                        baseline = ZeroBaseline(env_spec=env.spec)
                                    elif 'linear' in bas:
                                        baseline = LinearFeatureBaseline(env_spec=env.spec)
                                    else:
                                        baseline = GaussianMLPBaseline(env_spec=env.spec)
                                    #expert_policy = PointEnvExpertPolicy(env_spec=env.spec)
Example 17
]

other_env_class_map = {"Cartpole": CartpoleEnv}

if args.env in supported_gym_envs:
    gymenv = GymEnv(args.env,
                    force_reset=True,
                    record_video=False,
                    record_log=False)
    # gymenv.env.seed(1)
else:
    gymenv = other_env_class_map[args.env]()

#TODO: assert continuous space

env = TfEnv(normalize(gymenv))

policy = DeterministicMLPPolicy(
    env_spec=env.spec,
    name="policy",
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_sizes=(100, 100),
    hidden_nonlinearity=tf.nn.relu,
Example 18
step_sizes = [0.5, 0.5, 0.5,0.0, 0.5]
initial_params_files = [initial_params_file1, initial_params_file3, None,initial_params_file4]
gen_name = 'icml_point_results_'
names = ['maml','maml0','random','oracle']

exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)


        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )


        if initial_params_file is not None:
            policy = None
Example 19

from sandbox.rocky.tf.algos.pg_stein import PGStein
from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(DoublePendulumEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100, 50, 25),
    adaptive_std=True,
    std_hidden_sizes=(100,25),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = PGStein(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=500,
    n_itr=100,
Example 20
import os.path as osp

PROJECT_PATH = osp.abspath(osp.dirname(__file__))

# hyperparameters
num_of_generations = 201
num_of_steps = 10000

now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')

is_render = False

# config file
config_path = 'config/asteroids'
env = normalize(GymEnv("Asteroids-ramNoFrameskip-v0"))

policy = PowerGradientPolicy(
    env_spec=env.spec,
    neat_output_dim=(64, ),
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(64, 32))
# Load policy parameters = weights and bias of pretrained network
policy.load_policy('policy_parameters/model-asteroids.npz')


def do_rollout(agent, render=False):
    rewards = []
    for i in range(10):
        ob = env.reset()
        t = 0
Example 21
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.algos.ddpg import DDPG

stub(globals())

env = normalize(GymEnv("Quad-v0"))

use_trpo = True

if use_trpo:
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
Example 22
rand_step_test_rew_summary = data['rand_step_test']
adv_test_rew_summary = data['adv_test']
ne = data['exp_save']
ni = data['iter_save']

save_prefix = 'BASELINE-env-{}_{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}'.format(
    env_name, adv_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
    gae_lambda)
save_dir = os.environ['HOME'] + '/btpstuff/rllab-adv/results/baselines'
fig_dir = 'figs'
save_name = save_dir + '/' + save_prefix + '.p'
fig_name = fig_dir + '/' + save_prefix + '.png'

while ne < n_exps:
    ## Environment definition ##
    env = normalize(GymEnv(env_name, adv_fraction))
    ## Protagonist policy definition ##
    pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=layer_size,
                                   is_protagonist=True)
    pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

    ## Zero Adversary for the protagonist training ##
    zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                            is_protagonist=False,
                                            constant_val=0.0)

    ## Optimizer for the Protagonist ##
    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_process)
    if adv_name == 'no_adv':
Example 23
from envs.bullet.cartpole_bullet import CartPoleBulletEnv
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(GymEnv("CartPoleBulletEnv-v0"))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(8,)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.999,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
#    plot=True,
)

algo.train()
Example 24
from rllab.algos.ddpg_polyRL import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.exploration_strategies.persistence_length_2D_v2 import Persistence_Length_Exploration
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

from rllab.envs.mujoco.swimmer_env import SwimmerEnv

env = normalize(SwimmerEnv())


def run_task(*_):
    """
    DPG on Swimmer environment
    """
    env = normalize(SwimmerEnv())
    """
    Initialise the policy as a neural network policy
    """
    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    """
    Defining the exploration strategy: OUStrategy.
    """
    """
    This strategy implements the Ornstein-Uhlenbeck process, which adds
    time-correlated noise to the actions taken by the deterministic policy.
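For reference, a minimal sketch of the Ornstein-Uhlenbeck noise update described in the docstring above; the parameter names and values (mu, theta, sigma) are illustrative, not taken from rllab's OUStrategy.

import numpy as np

def ou_noise_step(state, mu=0.0, theta=0.15, sigma=0.3):
    # One Euler step of the OU process: drift the state toward mu, then add
    # Gaussian noise; consecutive samples are therefore time-correlated.
    return state + theta * (mu - state) + sigma * np.random.randn(*state.shape)

noise = np.zeros(2)  # one noise value per action dimension
for _ in range(5):
    noise = ou_noise_step(noise)
    # exploration_action = deterministic_policy_action + noise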
Example 25
fast_learning_rates = [0.1]
baselines = ['linear']
fast_batch_size = 20
meta_batch_size = 60
max_path_length = 10
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True

for fast_learning_rate in fast_learning_rates:
    for bas in baselines:
        stub(globals())

        env = TfEnv(normalize(GridWorldEnvRand('four-state')))
        policy = MAMLCategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=fast_learning_rate,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100,100),
        )
        if bas == 'zero':
            baseline = ZeroBaseline(env_spec=env.spec)
        elif 'linear' in bas:
            baseline = LinearFeatureBaseline(env_spec=env.spec)
        else:
            baseline = GaussianMLPBaseline(env_spec=env.spec)
        algo = MAMLTRPO(
            env=env,
Example 26
def run_experiment(variant):
    env_params = variant['env_params']
    policy_params = variant['policy_params']
    value_fn_params = variant['value_fn_params']
    algorithm_params = variant['algorithm_params']
    replay_buffer_params = variant['replay_buffer_params']
    sampler_params = variant['sampler_params']

    task = variant['task']
    domain = variant['domain']

    env = normalize(ENVIRONMENTS[domain][task](**env_params))

    pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params)

    sampler = SimpleSampler(**sampler_params)

    base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler)

    M = value_fn_params['layer_size']
    qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1')
    qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2')
    vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))

    initial_exploration_policy = UniformPolicy(env_spec=env.spec)

    if policy_params['type'] == 'gaussian':
        policy = GaussianPolicy(
            env_spec=env.spec,
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            reg=1e-3,
        )
    elif policy_params['type'] == 'lsp':
        nonlinearity = {
            None: None,
            'relu': tf.nn.relu,
            'tanh': tf.nn.tanh
        }[policy_params['preprocessing_output_nonlinearity']]

        preprocessing_hidden_sizes = policy_params.get(
            'preprocessing_hidden_sizes')
        if preprocessing_hidden_sizes is not None:
            observations_preprocessor = MLPPreprocessor(
                env_spec=env.spec,
                layer_sizes=preprocessing_hidden_sizes,
                output_nonlinearity=nonlinearity)
        else:
            observations_preprocessor = None

        policy_s_t_layers = policy_params['s_t_layers']
        policy_s_t_units = policy_params['s_t_units']
        s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers

        bijector_config = {
            'num_coupling_layers': policy_params['coupling_layers'],
            'translation_hidden_sizes': s_t_hidden_sizes,
            'scale_hidden_sizes': s_t_hidden_sizes,
        }

        policy = LatentSpacePolicy(
            env_spec=env.spec,
            squash=policy_params['squash'],
            bijector_config=bijector_config,
            reparameterize=policy_params['reparameterize'],
            q_function=qf1,
            observations_preprocessor=observations_preprocessor)
    elif policy_params['type'] == 'gmm':
        # reparameterize should always be False if using a GMMPolicy
        policy = GMMPolicy(
            env_spec=env.spec,
            K=policy_params['K'],
            hidden_layer_sizes=(M, M),
            reparameterize=policy_params['reparameterize'],
            qf=qf1,
            reg=1e-3,
        )
    else:
        raise NotImplementedError(policy_params['type'])

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        pool=pool,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        lr=algorithm_params['lr'],
        scale_reward=algorithm_params['scale_reward'],
        discount=algorithm_params['discount'],
        tau=algorithm_params['tau'],
        reparameterize=algorithm_params['reparameterize'],
        target_update_interval=algorithm_params['target_update_interval'],
        action_prior=policy_params['action_prior'],
        save_full_state=False,
    )

    algorithm._sess.run(tf.global_variables_initializer())

    algorithm.train()
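run_experiment above reads everything from nested sub-dictionaries of variant. A hypothetical sketch of that structure for the 'gaussian' policy branch; the keys are the ones the function actually accesses, while the values (and the domain/task names) are illustrative only.

variant = dict(
    domain='swimmer',                 # must index into ENVIRONMENTS
    task='default',
    env_params={},
    policy_params=dict(
        type='gaussian',
        reparameterize=True,
        action_prior='uniform',
    ),
    value_fn_params=dict(layer_size=128),
    replay_buffer_params=dict(max_replay_buffer_size=int(1e6)),
    sampler_params=dict(max_path_length=1000, min_pool_size=1000,
                        batch_size=256),
    algorithm_params=dict(
        base_kwargs=dict(n_epochs=500, epoch_length=1000, n_train_repeat=1),
        lr=3e-4,
        scale_reward=1.0,
        discount=0.99,
        tau=0.005,
        reparameterize=True,
        target_update_interval=1,
    ),
)
run_experiment(variant)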
Example 27


from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
Example 28
def run_FaReLI(input_feed=None):
    beta_adam_steps_list = [(1,50)]
    # beta_curve = [250,250,250,250,250,5,5,5,5,1,1,1,1,] # make sure to check maml_experiment_vars
    # beta_curve = [1000] # make sure to check maml_experiment_vars
    adam_curve = [250,249,248,247,245,50,50,10] # make sure to check maml_experiment_vars
    # adam_curve = None

    fast_learning_rates = [1.0]
    baselines = ['linear',]  # linear GaussianMLP MAMLGaussianMLP zero
    env_option = ''
    # mode = "ec2"
    mode = "local"
    extra_input = "onehot_exploration" # "onehot_exploration" "gaussian_exploration"
    # extra_input = None
    extra_input_dim = 5
    # extra_input_dim = None
    goals_suffixes = ["_200_40_1"] #,"_200_40_2", "_200_40_3","_200_40_4"]
    # goals_suffixes = ["_1000_40"]

    fast_batch_size_list = [20]  # 20 # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]  #inner grad update size
    meta_batch_size_list = [40]  # 40 @ 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
    max_path_length = 100  # 100
    num_grad_updates = 1
    meta_step_size = 0.01
    pre_std_modifier_list = [1.0]
    post_std_modifier_train_list = [0.00001]
    post_std_modifier_test_list = [0.00001]
    l2loss_std_mult_list = [1.0]
    importance_sampling_modifier_list = ['']  #'', 'clip0.5_'
    limit_demos_num_list = [1]  # 40
    test_goals_mult = 1
    bas_lr = 0.01 # baseline learning rate
    momentum=0.5
    bas_hnl = tf.nn.relu
    baslayers_list = [(32,32), ]

    basas = 60 # baseline adam steps
    use_corr_term = True
    seeds = [1] #,2,3,4,5]
    envseeds = [6]
    use_maml = True
    test_on_training_goals = False
    for goals_suffix in goals_suffixes:
        for envseed in envseeds:
            for seed in seeds:
                for baslayers in baslayers_list:
                    for fast_batch_size in fast_batch_size_list:
                        for meta_batch_size in meta_batch_size_list:
                            for ism in importance_sampling_modifier_list:
                                for limit_demos_num in limit_demos_num_list:
                                    for l2loss_std_mult in l2loss_std_mult_list:
                                        for post_std_modifier_train in post_std_modifier_train_list:
                                            for post_std_modifier_test in post_std_modifier_test_list:
                                                for pre_std_modifier in pre_std_modifier_list:
                                                    for fast_learning_rate in fast_learning_rates:
                                                        for beta_steps, adam_steps in beta_adam_steps_list:
                                                            for bas in baselines:
                                                                stub(globals())
                                                                tf.set_random_seed(seed)
                                                                np.random.seed(seed)
                                                                rd.seed(seed)
                                                                env = TfEnv(normalize(Reacher7DofMultitaskEnv(envseed=envseed)))
                                                                exp_name = str(
                                                                    'R7_IL'
                                                                    # +time.strftime("%D").replace("/", "")[0:4]
                                                                    + goals_suffix + "_"
                                                                    + str(seed)
                                                                    # + str(envseed)
                                                                    + ("" if use_corr_term else "nocorr")
                                                                    # + str(int(use_maml))
                                                                    + ('_fbs' + str(fast_batch_size) if fast_batch_size!=20 else "")
                                                                    + ('_mbs' + str(meta_batch_size) if meta_batch_size!=40 else "")
                                                                    + ('_flr' + str(fast_learning_rate) if fast_learning_rate!=1.0 else "")
                                                                    + '_dem' + str(limit_demos_num)
                                                                    + ('_ei' + str(extra_input_dim) if type(
                                                                        extra_input_dim) == int else "")
                                                                    # + '_tgm' + str(test_goals_mult)
                                                                    #     +'metalr_'+str(meta_step_size)
                                                                    #     +'_ngrad'+str(num_grad_updates)
                                                                    + ("_bs" + str(beta_steps) if beta_steps != 1 else "")
                                                                    + "_as" + str(adam_steps)
                                                                    # +"_net" + str(net_size[0])
                                                                    # +"_L2m" + str(l2loss_std_mult)
                                                                    + ("_prsm" + str(
                                                                        pre_std_modifier) if pre_std_modifier != 1 else "")
                                                                    # + "_pstr" + str(post_std_modifier_train)
                                                                    # + "_posm" + str(post_std_modifier_test)
                                                                    #  + "_l2m" + str(l2loss_std_mult)
                                                                    + ("_" + ism if len(ism) > 0 else "")
                                                                    + "_bas" + bas[0]
                                                                    # +"_tfbe" # TF backend for baseline
                                                                    # +"_qdo" # quad dist optimizer
                                                                    + (("_bi" if bas_hnl == tf.identity else (
                                                                        "_brel" if bas_hnl == tf.nn.relu else "_bth"))  # identity or relu or tanh for baseline
                                                                       # + "_" + str(baslayers)  # size
                                                                       + "_baslr" + str(bas_lr)
                                                                       + "_basas" + str(basas) if bas[0] in ["G",
                                                                                                             "M"] else "")  # baseline adam steps
                                                                    + ("r" if test_on_training_goals else "")
                                                                    + "_" + time.strftime("%d%m_%H_%M"))



                                                                policy = MAMLGaussianMLPPolicy(
                                                                    name="policy",
                                                                    env_spec=env.spec,
                                                                    grad_step_size=fast_learning_rate,
                                                                    hidden_nonlinearity=tf.nn.relu,
                                                                    hidden_sizes=(100, 100),
                                                                    std_modifier=pre_std_modifier,
                                                                    # metalearn_baseline=(bas == "MAMLGaussianMLP"),
                                                                    extra_input_dim=(0 if extra_input is None else extra_input_dim),
                                                                )
                                                                if bas == 'zero':
                                                                    baseline = ZeroBaseline(env_spec=env.spec)
                                                                elif bas == 'MAMLGaussianMLP':
                                                                    baseline = MAMLGaussianMLPBaseline(env_spec=env.spec,
                                                                                                       learning_rate=bas_lr,
                                                                                                       hidden_sizes=baslayers,
                                                                                                       hidden_nonlinearity=bas_hnl,
                                                                                                       repeat=basas,
                                                                                                       repeat_sym=basas,
                                                                                                       momentum=momentum,
                                                                                                       extra_input_dim=( 0 if extra_input is None else extra_input_dim),

                                                                                                       # learn_std=False,
                                                                                                       # use_trust_region=False,
                                                                                                       # optimizer=QuadDistExpertOptimizer(
                                                                                                       #      name="bas_optimizer",
                                                                                                       #     #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                                                                                                       #     #  tf_optimizer_args=dict(
                                                                                                       #     #      learning_rate=bas_lr,
                                                                                                       #     #  ),
                                                                                                       #     # # tf_optimizer_cls=tf.train.AdamOptimizer,
                                                                                                       #     # max_epochs=200,
                                                                                                       #     # batch_size=None,
                                                                                                       #      adam_steps=basas
                                                                                                       #     )
                                                                                                       )

                                                                elif bas == 'linear':
                                                                    baseline = LinearFeatureBaseline(env_spec=env.spec)
                                                                elif "GaussianMLP" in bas:
                                                                    baseline = GaussianMLPBaseline(env_spec=env.spec,
                                                                                                   regressor_args=dict(
                                                                                                       hidden_sizes=baslayers,
                                                                                                       hidden_nonlinearity=bas_hnl,
                                                                                                       learn_std=False,
                                                                                                       # use_trust_region=False,
                                                                                                       # normalize_inputs=False,
                                                                                                       # normalize_outputs=False,
                                                                                                       optimizer=QuadDistExpertOptimizer(
                                                                                                           name="bas_optimizer",
                                                                                                           #  tf_optimizer_cls=tf.train.GradientDescentOptimizer,
                                                                                                           #  tf_optimizer_args=dict(
                                                                                                           #      learning_rate=bas_lr,
                                                                                                           #  ),
                                                                                                           # # tf_optimizer_cls=tf.train.AdamOptimizer,
                                                                                                           # max_epochs=200,
                                                                                                           # batch_size=None,
                                                                                                           adam_steps=basas,
                                                                                                           use_momentum_optimizer=True,
                                                                                                       )))
                                                                algo = MAMLIL(
                                                                    env=env,
                                                                    policy=policy,
                                                                    baseline=baseline,
                                                                    batch_size=fast_batch_size,  # number of trajs for alpha grad update
                                                                    max_path_length=max_path_length,
                                                                    meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
                                                                    num_grad_updates=num_grad_updates,  # number of alpha grad updates
                                                                    n_itr=800, #100
                                                                    make_video=True,
                                                                    use_maml=use_maml,
                                                                    use_pooled_goals=True,
                                                                    use_corr_term=use_corr_term,
                                                                    test_on_training_goals=test_on_training_goals,
                                                                    metalearn_baseline=(bas=="MAMLGaussianMLP"),
                                                                    # metalearn_baseline=False,
                                                                    limit_demos_num=limit_demos_num,
                                                                    test_goals_mult=test_goals_mult,
                                                                    step_size=meta_step_size,
                                                                    plot=False,
                                                                    beta_steps=beta_steps,
                                                                    adam_curve=adam_curve,
                                                                    adam_steps=adam_steps,
                                                                    pre_std_modifier=pre_std_modifier,
                                                                    l2loss_std_mult=l2loss_std_mult,
                                                                    importance_sampling_modifier=MOD_FUNC[ism],
                                                                    post_std_modifier_train=post_std_modifier_train,
                                                                    post_std_modifier_test=post_std_modifier_test,
                                                                    expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT[env_option+"."+mode+goals_suffix+("_"+str(extra_input_dim) if type(extra_input_dim) == int else "")],
                                                                    expert_trajs_suffix=("_"+str(extra_input_dim) if type(extra_input_dim) == int else ""),
                                                                    seed=seed,
                                                                    extra_input=extra_input,
                                                                    extra_input_dim=(0 if extra_input is None else extra_input_dim),
                                                                    input_feed=input_feed,
                                                                    run_on_pr2=False,

                                                                )
                                                                run_experiment_lite(
                                                                    algo.train(),
                                                                    n_parallel=1,
                                                                    snapshot_mode="last",
                                                                    python_command='python3',
                                                                    seed=seed,
                                                                    exp_prefix=str('R7_IL_'
                                                                                   +time.strftime("%D").replace("/", "")[0:4]),
                                                                    exp_name=exp_name,
                                                                    plot=False,
                                                                    sync_s3_pkl=True,
                                                                    mode=mode,
                                                                    terminate_machine=True,
                                                                )
Example 29
gen_name = 'icml_ant_results_'
names = ['maml','pretrain','random', 'oracle']
exp_names = [gen_name + name for name in names]

step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]


all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []

    for goal in goals:

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(AntEnvOracle())
            n_itr = 1
        else:
            env = normalize(AntEnvRand())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
Example 30
from sandbox.rocky.tf.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(env=env,
           policy=policy,
           baseline=baseline,
           batch_size=10000,
           max_path_length=100,
           n_itr=40,
           discount=0.99,
           optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))
run_experiment_lite(
    algo.train(),
    n_parallel=2,
Example 31
    return res


def dis_iw(iw):
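    # Scales the k-th importance weight by discount**k (the discount factor is
    # defined elsewhere in this example) and returns the result as a NumPy array.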
    z = list()
    t = 1
    for y in iw:
        z.append(y * t)
        t *= discount
    return np.array(z)


load_policy = True
# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(GymEnv("Swimmer-v1"))
# Initialize a neural network policy with two hidden layers of 32 hidden units each
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32))
snap_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32))
back_up_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32))
parallel_sampler.populate_task(env, policy)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution
snap_dist = snap_policy.distribution
# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
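The snapshot policy above is typically compared against the current policy through per-step importance weights, which dis_iw() then discounts along a trajectory. Below is a minimal sketch of how such weights could be built symbolically, assuming Theano and the names defined in this example; observations_var and actions_var are illustrative placeholders, and the direction of the likelihood ratio depends on the estimator being used.

import theano.tensor as TT

observations_var = env.observation_space.new_tensor_variable('observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)

# Symbolic per-step log-likelihoods of the sampled actions under the current
# policy and under the snapshot policy.
logli_cur = dist.log_likelihood_sym(actions_var, policy.dist_info_sym(observations_var))
logli_snap = snap_dist.log_likelihood_sym(actions_var, snap_policy.dist_info_sym(observations_var))

# Per-step likelihood ratio between the two policies; after evaluating it on a
# sampled trajectory, dis_iw() can be applied to the resulting weights.
iw_sym = TT.exp(logli_cur - logli_snap)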
Example 32
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(AntEnv())

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'],
                                                   bounds=v['goal_range'],
                                                   center=v['goal_center'])
    env = GoalExplorationEnv(
        env=inner_env,
        goal_generator=uniform_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        append_transformed_obs=v['append_transformed_obs'],
        append_extra_info=v['append_extra_info'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    if v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    test_and_plot_policy(policy,
                         env,
                         max_reward=v['max_reward'],
                         sampling_res=sampling_res,
                         n_traj=v['n_traj'],
                         itr=outer_iter,
                         report=report,
                         limit=v['goal_range'],
                         center=v['goal_center'],
                         bounds=v['goal_range'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key +
                                                                  '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['goal_size'],
        evaluater_size=v['num_labels'],
        state_range=v['goal_range'],
        state_center=v['goal_center'],
        state_noise_level=v['goal_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )

    # log first samples from the GAN
    initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

    logger.log("Labeling the goals")
    labels = label_states(initial_goals,
                          env,
                          policy,
                          v['horizon'],
                          n_traj=v['n_traj'],
                          key='goal_reached')

    plot_labeled_states(initial_goals,
                        labels,
                        report=report,
                        itr=outer_iter,
                        limit=v['goal_range'],
                        center=v['goal_center'])
    report.new_row()

    all_goals = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        feasible_goals = generate_initial_goals(env,
                                                policy,
                                                v['goal_range'],
                                                goal_center=v['goal_center'],
                                                horizon=v['horizon'])
        labels = np.ones((feasible_goals.shape[0],
                          2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'],
                            summary_string_base='On-policy Goals:\n')
        if v['only_on_policy']:
            goals = feasible_goals[np.random.choice(
                feasible_goals.shape[0], v['num_new_goals'], replace=False), :]
        else:
            logger.log("Training the GAN")
            gan.pretrain(feasible_goals, v['gan_outer_iters'])
            # Sample GAN
            logger.log("Sampling goals from the GAN")
            raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals'])

            if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0:
                old_goals = all_goals.sample(v['num_old_goals'])
                goals = np.vstack([raw_goals, old_goals])
            else:
                goals = raw_goals

        with ExperimentLogger(log_dir,
                              'last',
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            env.update_goal_generator(
                UniformListStateGenerator(
                    goals.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [goals, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=True,
                env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(goals,
                                         env,
                                         policy,
                                         v['horizon'],
                                         as_goals=True,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        test_and_plot_policy(policy,
                             env,
                             max_reward=v['max_reward'],
                             sampling_res=sampling_res,
                             n_traj=v['n_traj'],
                             itr=outer_iter,
                             report=report,
                             limit=v['goal_range'],
                             center=v['goal_center'],
                             bounds=v['goal_range'])

        plot_labeled_states(goals,
                            labels,
                            report=report,
                            itr=outer_iter,
                            limit=v['goal_range'],
                            center=v['goal_center'])

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_goals = [
            goal for goal, label in zip(goals, labels) if label[0] == 1
        ]  # this is not used if no replay buffer
        all_goals.append(filtered_raw_goals)

        if v['add_on_policy']:
            logger.log("sampling on policy")
            feasible_goals = generate_initial_goals(
                env,
                policy,
                v['goal_range'],
                goal_center=v['goal_center'],
                horizon=v['horizon'])
            # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:]
            all_goals.append(feasible_goals)
Example 33
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.mujoco.swimmer_env import SwimmerEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(SwimmerEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=500,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()
Example 34
meta_iter = FLAGS.meta_iter
meta_method = FLAGS.meta_method
direc = FLAGS.direc
mode = FLAGS.mode
load_policy = FLAGS.load_policy

# option
max_path_length = 200
num_grad_updates = 1
num_leader_grad_updates = 2

stub(globals())

# task type    
if direc:
    env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
else:
    env = TfEnv(normalize(HalfCheetahEnvRand()))
direc_str = 'direc' if direc else ''

# svpg str
if svpg:
    svpg_str = '_SVPG' + '_alpha' + str(svpg_alpha)
else:
    svpg_str = '_VPG'

# bmaml|emaml
if not svpg:
    maml_type = 'emaml'
else:
    maml_type = 'bmaml'
Example 35
def create_env_rllab(env, seed):
    env_name = re.match(r'rllab.(\S+)', env).group(1)
    env_rllab_class = rllab_env_from_name(env_name)
    env = normalize(env_rllab_class())
    return env
from rllab.algos.spg_ddpg_unified import SPG_DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy

from rllab.policies.stochastic_mlp_policy import GaussianMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

from rllab.envs.mujoco.hopper_env import HopperEnv

env = normalize(HopperEnv())


def run_task(*_):
    env = normalize(HopperEnv())

    # policy = DeterministicMLPPolicy(
    #     env_spec=env.spec,
    #     # The neural network policy should have two hidden layers, each with 32 hidden units.
    #     hidden_sizes=(32, 32)
    # )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)
Example 37


from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

stub(globals())

env = normalize(GymEnv("Pendulum-v0"))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 8 hidden units.
    hidden_sizes=(8, 8)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
Example 38
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from rllab.sampler import parallel_sampler
from lasagne.updates import sgd
from lasagne.updates import adam
from rllab.misc import ext
import pandas as pd

load_policy = True
# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ), learn_std=False)
parallel_sampler.populate_task(env, policy)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution
# We will collect 100 trajectories per iteration
N = 10
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 1000
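To make the comment above concrete, the symbolic log-likelihood is usually combined with the empirical returns into a REINFORCE-style surrogate loss and compiled with the Theano imports above. This is a minimal sketch, not part of the original example: observations_var, actions_var, returns_var and the 1e-3 learning rate are illustrative placeholders.

observations_var = env.observation_space.new_tensor_variable('observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')

# Symbolic log-likelihood of the sampled actions under the current policy.
dist_info_vars = policy.dist_info_sym(observations_var)
logli = dist.log_likelihood_sym(actions_var, dist_info_vars)

# Negated so that minimizing the surrogate maximizes the expected return.
surr_loss = -TT.mean(logli * returns_var)

params = policy.get_params(trainable=True)
f_train = theano.function(
    inputs=[observations_var, actions_var, returns_var],
    outputs=surr_loss,
    updates=adam(theano.grad(surr_loss, params), params, learning_rate=1e-3),
    allow_input_downcast=True,
)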
Example 39
parser = argparse.ArgumentParser(description='Train a policy')
parser.add_argument('-a', action="store", dest="alg")
parser.add_argument('-e', action="store", dest="env")
parsed = parser.parse_args()

stub(globals())

alg = "DDPG"

envs = {"Arm": ArmEnv,
        "Stand": StandEnv,
        "Gait": GaitEnv,
        "Crouch": CrouchEnv,
        "Hop": HopEnv}

env = normalize(envs[parsed.env](visualize=False))

# env = normalize(CartpoleEnv())
# env = normalize(GymEnv("Pendulum-v0", record_video=False, record_log=False))

if alg == "DDPG":
    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(64, 64, 64)
    )

    es = OUStrategy(env_spec=env.spec, theta = 0.5)

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
# stub(globals())
#
# supported_envs = ["MountainCar-v0", "CartPole-v0"]
#
# if args.env not in supported_envs:
#     raise Exception("Env not supported! Try it out though?")

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288

register_custom_envs()

gymenv = GymEnv(args.env, force_reset=True)
# gymenv.env.seed(124)
env = TfEnv(normalize(gymenv, normalize_obs=False))

if env.spec.action_space == 'Discrete':
    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )
else:
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25)
    )
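Note that env.spec.action_space is a Space object rather than a string, so comparing it to 'Discrete' above is fragile. A sketch of a more robust dispatch, assuming the TF sandbox's Discrete space class (the import path is an assumption, not taken from this example):

from sandbox.rocky.tf.spaces.discrete import Discrete  # assumed import path

if isinstance(env.spec.action_space, Discrete):
    policy = CategoricalMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
else:
    policy = GaussianMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(100, 50, 25))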
Example 41
from __future__ import print_function
from __future__ import absolute_import

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

stub(globals())

# env = normalize(GymEnv("Pendulum-v0", record_video=False))
env = normalize(GymEnv("VREP-v0", record_video=False))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(128, 128)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    # batch_size=4000,
    batch_size=1000,
    max_path_length=env.horizon,
    n_itr=500,
Example 42
step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]

names = ['random']
exp_names = [gen_name + name for name in names]
initial_params_files = [None]
step_sizes = [0.5]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)),
                                       initial_params_files):
    avg_returns = []
    for goal_i, goal in zip(range(len(goals)), goals):

        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(HalfCheetahEnvDirecOracle())
            n_itr = 1
        else:
            env = normalize(HalfCheetahEnvRandDirec())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
Example 43
def run_task(*_):
    v_enter = 30
    inner_length = 800
    long_length = 100
    short_length = 800
    n = 1
    m = 5
    num_cars_left = 3
    num_cars_right = 3
    num_cars_top = 15
    num_cars_bot = 15
    tot_cars = (num_cars_left + num_cars_right) * m \
        + (num_cars_bot + num_cars_top) * n
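    # With the values above: (3 + 3) * 5 + (15 + 15) * 1 = 30 + 30 = 60 vehicles in total.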

    grid_array = {
        "short_length": short_length,
        "inner_length": inner_length,
        "long_length": long_length,
        "row_num": n,
        "col_num": m,
        "cars_left": num_cars_left,
        "cars_right": num_cars_right,
        "cars_top": num_cars_top,
        "cars_bot": num_cars_bot
    }

    sumo_params = SumoParams(sim_step=1, sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add(veh_id="idm",
                 acceleration_controller=(SumoCarFollowingController, {}),
                 sumo_car_following_params=SumoCarFollowingParams(
                     minGap=2.5,
                     max_speed=v_enter,
                 ),
                 routing_controller=(GridRouter, {}),
                 num_vehicles=tot_cars,
                 speed_mode="all_checks")

    additional_env_params = {
        "target_velocity": 50,
        "num_steps": 500,
        "control-length": 150,
        "switch_time": 3.0
    }
    env_params = EnvParams(additional_params=additional_env_params)

    additional_net_params = {
        "speed_limit": 35,
        "grid_array": grid_array,
        "horizontal_lanes": 1,
        "vertical_lanes": 1,
        "traffic_lights": True
    }

    initial_config, net_params = get_non_flow_params(10, additional_net_params)

    scenario = SimpleGridScenario(name="grid-intersection",
                                  generator_class=SimpleGridGenerator,
                                  vehicles=vehicles,
                                  net_params=net_params,
                                  initial_config=initial_config)

    env_name = "GreenWaveEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params, net_params,
                   initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=40000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=800,
        discount=0.999,
        # step_size=0.01,
    )
    algo.train()
Example 44
        """
        Returns a Space object
        """
        low = np.array(
            [0, -np.pi / 2, -np.pi / 2, 0, -np.pi, -np.pi, 0, -np.pi, -np.pi])
        high = np.array([
            100, np.pi / 2, np.pi / 2, 1000, np.pi, np.pi, 1000, np.pi, np.pi
        ])
        return Box(low=low, high=high)

    def log_diagnostics(self, paths):
        pass


if __name__ == "__main__":
    from rllab.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

    env = normalize(FlightEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=400,
                batch_size=4000,
                gae_lambda=0.7)
    algo.train()
Example 45
parser.add_argument("--text_log_file", default="./data/debug.log", help="Where text output will go")
parser.add_argument("--tabular_log_file", default="./data/progress.csv", help="Where tabular output will go")
args = parser.parse_args()

# stub(globals())

# ext.set_seed(1)
logger.add_text_output(args.text_log_file)
logger.add_tabular_output(args.tabular_log_file)
logger.set_log_tabular_only(False)

envs = []

for env_name in args.envs:
    gymenv = GymEnv(env_name, force_reset=True, record_video=False, record_log=False)
    env = TfEnv(normalize(gymenv))
    envs.append((env_name, env))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have three hidden layers of 100, 50 and 25 hidden units.
    hidden_sizes=(100, 50, 25),
    hidden_nonlinearity=tf.nn.relu,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)


with tf.Session() as sess:
    for env_name, env in envs:
Example 46
rand_test_rew_summary = []
step_test_rew_summary = []
rand_step_test_rew_summary = []
adv_test_rew_summary = []

## Preparing file to save results in ##
save_prefix = 'env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
    env_name, n_exps, n_itr, batch_size, adv_fraction, step_size, gae_lambda,
    random.randint(0, 1000000))
save_name = save_dir + '/' + save_prefix + '.p'

## Looping over experiments to carry out ##
for ne in range(n_exps):
    ## Environment definition ##
    ## The second argument to GymEnv defines the relative magnitude of the adversary. For testing we set this to 1.0.
    env = normalize(GymEnv(env_name, adv_fraction))
    env_orig = normalize(GymEnv(env_name, 1.0))

    ## Protagonist policy definition ##
    pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=layer_size,
                                   is_protagonist=True)
    pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

    ## Zero Adversary for the protagonist training ##
    zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                            is_protagonist=False,
                                            constant_val=0.0)

    ## Adversary policy definition ##
    adv_policy = GaussianMLPPolicy(env_spec=env.spec,

# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml=True

for v in variants:
    direc = v['direc']
    learning_rate = v['meta_step_size']

    if direc:
        env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        env = TfEnv(normalize(HalfCheetahEnvRand()))
    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100,100),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
Example 48
def train(num_experiments, thread_id, queue):

    ############ DEFAULT PARAMETERS ############

    env_name = None  #Name of adversarial environment
    path_length = 1000  #Maximum episode length
    layer_size = tuple([100, 100, 100])  #Layer definition
    ifRender = False  #Should we render?
    afterRender = 100  #After how many to animate
    n_exps = 1  #Number of training instances to run
    n_itr = 25  #Number of iterations of the alternating optimization
    n_pro_itr = 1  #Number of iterations for the protagonist
    n_adv_itr = 1  #Number of iterations for the adversary
    batch_size = 4000  #Number of training samples for each iteration
    ifSave = True  #Should we save?
    save_every = 100  #Save checkpoint every save_every iterations
    n_process = 1  #Number of parallel threads for sampling environment
    adv_fraction = 0.25  #Fraction of maximum adversarial force to be applied
    step_size = 0.01  #kl step size for TRPO
    gae_lambda = 0.97  #gae_lambda for learner
    save_dir = './results'  #folder to save result in

    ############ ENV SPECIFIC PARAMETERS ############

    env_name = 'Walker2dAdv-v1'

    layer_size = tuple([64, 64])
    step_size = 0.1
    gae_lambda = 0.97
    batch_size = 25000

    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4

    adv_fraction = 5.0
    adv_strengths = list(range(int(adv_fraction) + 1))

    save_dir = './../results/AdvWalker'

    args = [
        env_name, path_length, layer_size, ifRender, afterRender, n_exps,
        n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process,
        adv_fraction, step_size, gae_lambda, save_dir
    ]

    ############ ADVERSARIAL POLICY LOAD ############

    filepath = './../initial_results/Walker/env-Walker2dAdv-v1_Exp1_Itr1500_BS25000_Adv0.25_stp0.01_lam0.97_507500.p'
    res_D = pickle.load(open(filepath, 'rb'))
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############

    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument to GymEnv defines the relative magnitude of the adversary. For testing we set this to 1.0.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero Adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env,
                            pro_policy,
                            adv_policy,
                            path_length=path_length))

        ## Loops through adversary strength levels
        n_loopsize = int(n_itr / len(adv_strengths))
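        # For example, with n_itr = 500 and adv_fraction = 5.0 there are six
        # strength levels (0 through 5), so each level runs for
        # int(500 / 6) = 83 alternating iterations.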
        for adv_index, adv_strength in enumerate(adv_strengths):

            env = normalize(GymEnv(env_name, adv_strength))

            ## Optimizer for the Protagonist ##
            pro_algo = TRPO(env=env,
                            pro_policy=pro_policy,
                            adv_policy=adv_policy,
                            pro_baseline=pro_baseline,
                            adv_baseline=adv_baseline,
                            batch_size=batch_size,
                            max_path_length=path_length,
                            n_itr=n_pro_itr,
                            discount=0.995,
                            gae_lambda=gae_lambda,
                            step_size=step_size,
                            is_protagonist=True)

            logger.log(
                '\n\nAdversarial Level: {} Adversarial Strength: {}\n'.format(
                    adv_index, adv_strength))

            ## Beginning alternating optimization ##
            for ni in range(n_loopsize):
                logger.log(
                    '\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                        thread_id,
                        ne,
                        ni + n_loopsize * adv_index,
                    ))

                ## Train Protagonist
                pro_algo.train()
                pro_rews += pro_algo.rews
                all_rews += pro_algo.rews
                logger.log('Protag Reward: {}'.format(
                    np.array(pro_algo.rews).mean()))

                ## Test the learnt policies
                const_testing_rews.append(
                    test_const_adv(env, pro_policy, path_length=path_length))
                rand_testing_rews.append(
                    test_rand_adv(env, pro_policy, path_length=path_length))
                step_testing_rews.append(
                    test_step_adv(env, pro_policy, path_length=path_length))
                rand_step_testing_rews.append(
                    test_rand_step_adv(env,
                                       pro_policy,
                                       path_length=path_length))
                adv_testing_rews.append(
                    test_learnt_adv(env,
                                    pro_policy,
                                    adv_policy,
                                    path_length=path_length))

                if ni % afterRender == 0 and ifRender:
                    test_const_adv(env,
                                   pro_policy,
                                   path_length=path_length,
                                   n_traj=1,
                                   render=True)

                if ni != 0 and ni % save_every == 0 and ifSave:
                    ## SAVING CHECKPOINT INFO ##
                    pickle.dump(
                        {
                            'args': args,
                            'pro_policy': pro_policy,
                            'adv_policy': adv_policy,
                            'zero_test': [const_testing_rews],
                            'rand_test': [rand_testing_rews],
                            'step_test': [step_testing_rews],
                            'rand_step_test': [rand_step_testing_rews],
                            'iter_save': ni,
                            'exp_save': ne,
                            'adv_test': [adv_testing_rews]
                        },
                        open(
                            save_name + '_' +
                            str(ni + n_loopsize * adv_index) + '.p', 'wb'))

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([
        const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary,
        rand_step_test_rew_summary, adv_test_rew_summary
    ])

    ############ SAVING MODEL ############
    '''
class VG(VariantGenerator):

    @variant
    def step_size(self):
        return [0.01, 0.05, 0.1]

    @variant
    def seed(self):
        return [1, 11, 21, 31, 41]

variants = VG().variants()

for v in variants:

    env = TfEnv(normalize(GymEnv('HalfCheetah-v1', record_video=False, record_log=False)))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy"
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
Example 50
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from ddpg import DDPG
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from policies import DeterministicMLPPolicy
from qfuncs import ContinuousMLPQ
from strategies import OUStrategy
from utils import SEED
import mxnet as mx

# set environment, policy, qfunc, strategy

env = normalize(CartpoleEnv())

policy = DeterministicMLPPolicy(env.spec)
qfunc = ContinuousMLPQ(env.spec)
strategy = OUStrategy(env.spec)

# set the training algorithm and train

algo = DDPG(
    env=env,
    policy=policy,
    qfunc=qfunc,
    strategy=strategy,
    ctx=mx.gpu(0),
    max_path_length=100,
    epoch_length=1000,
gen_name = 'icml_antdirec_results_'
names = ['maml','pretrain','random', 'oracle']
step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]

exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal_i, goal in zip(range(len(goals)), goals):


        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(AntEnvDirecOracle())
            n_itr = 1
        else:
            env = normalize(AntEnvRandDirec())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
Example 52
ab_l1 = dict(mode='ours',
             mode2='ab_l1',
             scale=0.01,
             modelname='model/pushreal_l1/ablation_pushreal_L1_30000')

seeds = [123]

for params in [real_params]:
    for nvar in range(10):
        randparams = params['rand']()
        for modeparams in [
                ab_l2
        ]:  #, ours_mode, ours_nofeat, ours_noimage, ab_l2l3, ab_l1]:
            copyparams = randparams.copy()
            copyparams.update(modeparams)
            mdp = normalize(GymEnv(params['env'], **copyparams))
            for seed in seeds:
                policy = GaussianMLPPolicy(env_spec=mdp.spec,
                                           hidden_sizes=(32, 32),
                                           init_std=10)

                baseline = LinearFeatureBaseline(mdp.spec, )

                batch_size = 50 * 250
                algo = TRPO(env=mdp,
                            policy=policy,
                            baseline=baseline,
                            batch_size=batch_size,
                            whole_paths=True,
                            max_path_length=50,
                            n_itr=100,
Example 53
fast_learning_rates = [0.5]
baselines = ['linear']
fast_batch_size = 20  # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2]
meta_batch_size = 40  # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable
max_path_length = 100
num_grad_updates = 1
meta_step_size = 0.01

use_maml = True

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())

            env = TfEnv(normalize(PointEnvRandGoal()))
            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100,100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif 'linear' in bas:
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
gen_name = 'icml_cheetah_results_'
names = ['maml','pretrain','random', 'oracle']
exp_names = [gen_name + name for name in names]

step_sizes = [0.1, 0.02, 0.1, 0.0]
initial_params_files = [file1]#, None, None, None

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:


        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(HalfCheetahEnvOracle())
            n_itr = 1
        else:
            env = normalize(HalfCheetahEnvRandDisable())
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )

        if initial_params_file is not None:
            policy = None
Example 55
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv
env = normalize(SimpleHumanoidEnv())

# H_layer_first = [32, 100, 400]
# H_layer_second = [32, 100, 300]

H_layer_first = [32]
H_layer_second = [32]

# reward_scaling = [0.01, 0.1, 1.0]
reward_scaling = [0.01]

# critic_learning_rate = [1e-3, 10e-3]
# actor_learning_rate = [1e-4, 10e-4]

critic_learning_rate = [0.001]
actor_learning_rate = [0.0001]

#0.99 was originally set by rllab
discount_factor = 0.99

#originally : 32 set by rllab
size_of_batch = 64
mode = "local"
n_parallel = 4

exp_dir = '/home/lsy/Desktop/rllab/data/local/egoSwimmer-snn/'
for dir in os.listdir(exp_dir):
    if 'Figure' not in dir and os.path.isfile(
            os.path.join(exp_dir, dir, 'params.pkl')):
        pkl_path = os.path.join(exp_dir, dir, 'params.pkl')
        print("hier for : ", pkl_path)

        for time_step_agg in [10, 50, 100]:

            for activity_range in [6, 10, 15]:
                inner_env = normalize(
                    SwimmerGatherEnv(activity_range=activity_range,
                                     sensor_range=activity_range,
                                     sensor_span=math.pi * 2,
                                     ego_obs=True))
                env = hierarchize_snn(
                    inner_env,
                    time_steps_agg=time_step_agg,
                    pkl_path=pkl_path,
                    # animate=True,
                )

                policy = CategoricalMLPPolicy(env_spec=env.spec, )

                baseline = LinearFeatureBaseline(env_spec=env.spec)

                # bonus_evaluators = [GridBonusEvaluator(mesh_density=mesh_density, visitation_bonus=1, snn_H_bonus=0)]
                # reward_coef_bonus = [reward_coef]
Example 57
if use_tf:
    from sandbox.rocky.tf.algos.trpo import TRPO
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.envs.base import TfEnv
else:
    from rllab.algos.trpo import TRPO
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

#env = normalize(GymEnv("Pendulum-v0"))
env = normalize(GymEnv("Walker2d-v1"))

if use_tf:
    env = TfEnv(env)
    policy = GaussianMLPPolicy(
        name='policy',
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )
else:
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )
from rllab.envs.gym_env import GymEnv

from railrl.predictors.dynamics_model import ConvEncoder, InverseModel, ForwardModel
from railrl.algos.icm_trpo_tf import ICM

import itertools

import tensorflow as tf

stub(globals())

# Params range
seeds = range(0, 3)

for seed in seeds:
    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v8', record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))

    env_spec = env.spec
    cnn = ConvNetwork(
        name="conv_feature_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3,3),(3,3),(3,3),(3,3), (3,3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256,),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )
Example 59
step_sizes = [0.5, 0.5, 0.5, 0.0, 0.5]
initial_params_files = [initial_params_file1, initial_params_file3, None,initial_params_file4]
gen_name = 'icml_point_results_'
names = ['maml','maml0','random','oracle']

exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:
        goal = list(goal)


        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(PointEnvRandGoalOracle(goal=goal))
            n_itr = 1
        else:
            env = normalize(PointEnvRandGoal(goal=goal))
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_sizes=(100, 100),
        )


        if initial_params_file is not None:
            policy = None
Example 60
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2

    # hidden_sizes=(8,)
    hidden_sizes = (32, 32)
    # hidden_sizes=(100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    if args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    if args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    # algo = DDPG(
    # env=env,
    # policy=policy,
    # baseline=baseline,
    # batch_size=batch_size,
    # n_itr=n_itr,
    # discount=gamma,
    # step_size=step_size
    # )
    algo.train()

    return algo