def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
    )

    algorithm.train()
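For reference, a minimal sketch of the `variant` dictionary this function expects. The keys are exactly those read inside `run_experiment`; the concrete values are illustrative assumptions, not settings taken from the original experiment.

# Hypothetical variant; only the keys are dictated by run_experiment() above.
example_variant = dict(
    env_name='swimmer-rllab',
    max_pool_size=int(1e6),
    max_path_length=1000,
    epoch_length=1000,
    n_epochs=500,
    batch_size=128,
    n_train_repeat=1,
    layer_size=128,
    qf_lr=3e-4,
    policy_lr=3e-4,
    discount=0.99,
    reward_scale=1.0,
)
# run_experiment(example_variant)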
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, with a Box action space a
    # GaussianMLPPolicy works, but for a Discrete action space you may need a
    # CategoricalMLPPolicy (see the trpo_gym_cartpole.py example).
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
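Not shown in this snippet: `run_task` is normally handed to rllab's `run_experiment_lite` launcher so that sampling can run in parallel. A minimal sketch, assuming the standard rllab launcher; the seed and parallelism values are illustrative:

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # number of parallel sampling workers (illustrative)
    snapshot_mode="last",  # keep only the last iteration's snapshot
    seed=1,                # illustrative seed
    # plot=True,           # second plot flag referenced by the comment in run_task
)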
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec)

    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=32,
        max_path_length=100,
        epoch_length=1000,
        min_pool_size=10000,
        n_epochs=1000,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=1e-3,
        policy_learning_rate=1e-4,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def test(): env = normalize(MultiGoalEnv()) pool = SimpleReplayBuffer( env_spec=env.spec, max_replay_buffer_size=1e6, ) base_kwargs = dict( min_pool_size=100, epoch_length=100, n_epochs=1000, max_path_length=30, batch_size=64, n_train_repeat=1, eval_render=True, eval_n_episodes=10, ) M = 128 policy = StochasticNNPolicy( env.spec, hidden_layer_sizes=(M, M), squash=True) qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=[M, M]) plotter = QFPolicyPlotter( qf=qf, policy=policy, obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]), default_action=[np.nan, np.nan], n_samples=100) algorithm = SQL( base_kwargs=base_kwargs, env=env, pool=pool, qf=qf, policy=policy, plotter=plotter, policy_lr=3e-4, qf_lr=3e-4, value_n_particles=16, td_target_update_interval=1000, kernel_fn=adaptive_isotropic_gaussian_kernel, kernel_n_particles=32, kernel_update_ratio=0.5, discount=0.99, reward_scale=0.1, save_full_state=False, ) algorithm.train()
def run_experiment(**params): base_params = copy.copy(DEFAULTS) base_params.update(params) params = base_params pprint(params) grid_world = SlaveGridWorldEnv("walled_chain", max_traj_length=DEFAULTS["max_path_length"], goal_reward=params["goal_reward"]) agent = GridWorldMasterAgent(grid_world, match_reward=params["match_reward"]) env = normalize(SituatedConversationEnvironment(env=grid_world, b_agent=agent)) baseline = LinearFeatureBaseline(env) policy = RecurrentCategoricalPolicy( name="policy", env_spec=env.spec, hidden_dims=params["policy_hidden_dims"], feature_network=MLPNetworkWithEmbeddings( "feature_network", env.observation_space.flat_dim, params["feature_dim"], params["feature_hidden_dims"], tf.tanh, tf.tanh, agent.vocab_size, params["embedding_dim"]), state_include_action=False, ) optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=params["batch_size"], max_path_length=params["max_path_length"], n_itr=params["n_itr"], discount=0.99, step_size=params["step_size"], optimizer=optimizer, ) run_experiment_lite( algo.train(), n_parallel=15, snapshot_mode="last", exp_prefix="grid_world_sweep3", variant=params, )
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        plot=True,
    )
    algo.train()
def run_experiment(**params): base_params = copy.copy(DEFAULTS) base_params.update(params) params = base_params grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"]) env = normalize(grid_world) baseline = LinearFeatureBaseline(env) policy = CategoricalMLPPolicy( name="policy", env_spec=env.spec, hidden_sizes=params["policy_hidden_dims"], ) optimizer = ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=params["batch_size"], max_path_length=5, n_itr=params["n_itr"], discount=0.99, step_size=params["step_size"], optimizer=optimizer, ) run_experiment_lite( algo.train(), n_parallel=5, snapshot_mode="last", exp_prefix="grid_world_silent", variant=params, )
def run_task(v):
    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=40,
        discount=0.99,
        step_size=v["step_size"],
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
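Because this `run_task` reads `step_size` from a variant dictionary, it can be swept with `run_experiment_lite`'s `variant` argument. A minimal sketch, assuming the usual rllab launcher; the grid values and `exp_prefix` are illustrative:

from rllab.misc.instrument import run_experiment_lite

for step_size in [0.01, 0.05, 0.1]:  # illustrative grid
    run_experiment_lite(
        run_task,
        exp_prefix="trpo_cartpole_step_size_sweep",  # hypothetical prefix
        variant=dict(step_size=step_size),
        n_parallel=1,
        snapshot_mode="last",
        seed=1,
    )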
variants = VG().variants() max_path_length = 200 num_grad_updates = 1 use_maml = True for v in variants: task_var = v['task_var'] oracle = v['oracle'] if task_var == 0: task_var = 'direc' exp_prefix = 'bugfix_trpo_maml_antdirec' + str(max_path_length) if oracle: env = TfEnv(normalize(AntEnvDirecOracle())) else: env = TfEnv(normalize(AntEnvRandDirec())) elif task_var == 1: task_var = 'vel' exp_prefix = 'posticml_trpo_maml_ant' + str(max_path_length) if oracle: env = TfEnv(normalize(AntEnvOracle())) else: env = TfEnv(normalize(AntEnvRand())) elif task_var == 2: task_var = 'pos' exp_prefix = 'posticml_trpo_maml_antpos_' + str(max_path_length) if oracle: env = TfEnv(normalize(AntEnvRandGoalOracle())) else:
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" v_enter = 10 inner_length = 300 long_length = 100 short_length = 300 n = 3 m = 3 num_cars_left = 1 num_cars_right = 1 num_cars_top = 1 num_cars_bot = 1 tot_cars = (num_cars_left + num_cars_right) * m \ + (num_cars_bot + num_cars_top) * n grid_array = { "short_length": short_length, "inner_length": inner_length, "long_length": long_length, "row_num": n, "col_num": m, "cars_left": num_cars_left, "cars_right": num_cars_right, "cars_top": num_cars_top, "cars_bot": num_cars_bot } sumo_params = SumoParams(sim_step=1, render=True) vehicles = Vehicles() vehicles.add(veh_id="idm", acceleration_controller=(SumoCarFollowingController, {}), sumo_car_following_params=SumoCarFollowingParams( min_gap=2.5, tau=1.1, max_speed=v_enter), routing_controller=(GridRouter, {}), num_vehicles=tot_cars, speed_mode="all_checks") tl_logic = TrafficLights(baseline=False) additional_env_params = { "target_velocity": 50, "switch_time": 3.0, "num_observed": 2, "discrete": False, "tl_type": "controlled" } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1 } initial_config, net_params = get_flow_params(10, 300, n, m, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=tl_logic) env_name = "PO_TrafficLightGridEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
return [2] # should also code up alternative KL thing variants = VG().variants() max_path_length = 200 num_grad_updates = 1 use_maml=True for v in variants: task_var = v['task_var'] if task_var == 0: env = TfEnv(normalize(AntEnvRandDirec())) task_var = 'direc' elif task_var == 1: env = TfEnv(normalize(AntEnvRand())) task_var = 'vel' elif task_var == 2: env = TfEnv(normalize(AntEnvRandGoal())) task_var = 'pos' policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=v['fast_lr'], hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100,100), )
from humanoidopt.env import HumanoidOptEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(HumanoidOptEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()
from rllab.misc.instrument import VariantGenerator, stub, run_experiment_lite from sandbox.rocky.tf.algos.trpo import TRPO from sandbox.rocky.tf.core import layers as L from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp from praglang.environments import BagAutoencoderEnvironment from praglang.policies import RecurrentCategoricalPolicy from praglang.util import MLPNetworkWithEmbeddings stub(globals()) LENGTH = 5 VOCAB = list("abcdefghijklmnopqrstuvwxyz") env = normalize(BagAutoencoderEnvironment(VOCAB, LENGTH, "autoenc")) DEFAULTS = { "batch_size": 5000, "n_itr": 500, "step_size": 0.001, "policy_hidden_dims": (128,), "embedding_dim": 32, "feature_dim": 128, "feature_hidden_dims": (), } config.LOG_DIR = "./log" def run_experiment(params):
def main(): parser = argparse.ArgumentParser() parser.add_argument('env_fname', type=str, help='config file with environment arguments') parser.add_argument('transformers_fname', type=str) parser.add_argument('mean_network_type', type=str, choices=['conv', 'siamese']) parser.add_argument('--conv_filters', nargs='*', type=int, default=[16, 32]) parser.add_argument('--hidden_sizes', nargs='*', type=int, default=[16]) parser.add_argument('--init_std', type=float, default=1.0) parser.add_argument('--n_itr', type=int, default=100) parser.add_argument('--step_size', type=float, default=0.01) parser.add_argument('--batch_size', type=int, default=10000) parser.add_argument('--use_static_car', action='store_true') parser.add_argument('--use_init_heuristic', action='store_true') args = parser.parse_args() with open(args.env_fname) as yaml_string: env_config = yaml.load(yaml_string) if issubclass(env_config['class'], envs.RosEnv): import rospy rospy.init_node("generate_data") env = from_config(env_config) if args.use_static_car: env.car_env.speed_offset_space.low = \ env.car_env.speed_offset_space.high = np.array([0.0, 4.0]) # transformers with open(args.transformers_fname) as transformers_file: transformers_config = yaml.load(transformers_file) transformers = dict() for data_name, transformer_config in transformers_config.items(): if data_name == 'action': replace_config = {'space': env.action_space} elif data_name in env.observation_space.spaces: replace_config = {'space': env.observation_space.spaces[data_name]} else: replace_config = {} transformers[data_name] = from_config(transformers_config[data_name], replace_config=replace_config) env = ServoingEnv(env) env = RllabEnv(env, transformers=transformers) env = normalize(env) network_kwargs = dict( input_shape=env.observation_space.shape, output_dim=env.action_space.flat_dim, conv_filters=args.conv_filters, conv_filter_sizes=[3] * len(args.conv_filters), conv_strides=[2] * len(args.conv_filters), conv_pads=[0] * len(args.conv_filters), hidden_sizes=args.hidden_sizes, hidden_nonlinearity=LN.rectify, output_nonlinearity=None, name="mean_network", ) if args.mean_network_type == 'conv': mean_network = ConvNetwork(**network_kwargs) elif args.mean_network_type == 'siamese': mean_network = SiameseQuadraticErrorNetwork(**network_kwargs) else: raise NotImplementedError policy = GaussianConvPolicy( env_spec=env.spec, init_std=args.init_std, mean_network=mean_network, ) if args.use_init_heuristic: W_var = policy.get_params()[0] W = W_var.get_value() W[:, 3:, :, :] = -W[:, :3, :, :] W_var.set_value(W) baseline = GaussianConvBaseline( env_spec=env.spec, regressor_args=dict( use_trust_region=True, step_size=args.step_size, normalize_inputs=True, normalize_outputs=True, hidden_sizes=args.hidden_sizes, conv_filters=args.conv_filters, conv_filter_sizes=[3] * len(args.conv_filters), conv_strides=[2] * len(args.conv_filters), conv_pads=[0] * len(args.conv_filters), batchsize=args.batch_size * 10, )) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=args.batch_size, max_path_length=100, n_itr=args.n_itr, discount=0.9, step_size=args.step_size, ) algo.train() import IPython as ipy ipy.embed()
def doit(mode): from rllab.envs.box2d.cartpole_env import CartpoleEnv from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab.baselines.zero_baseline import ZeroBaseline from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy from rllab.envs.normalized_env import normalize import numpy as np import theano import theano.tensor as TT from lasagne.updates import adam # normalize() makes sure that the actions for the environment lies # within the range [-1, 1] (only works for environments with continuous actions) env = normalize(CartpoleEnv()) # Initialize a neural network policy with a single hidden layer of 8 hidden units policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,)) # Initialize a linear baseline estimator using default hand-crafted features if "linbaseline" in mode: print('linear baseline') baseline = LinearFeatureBaseline(env.spec) elif "vanilla" in mode: print("zero baseline") baseline = ZeroBaseline(env.spec) elif mode == "batchavg": print('batch average baseline') # use a zero baseline but subtract the mean of the discounted returns (see below) baseline = ZeroBaseline(env.spec) if "_ztrans" in mode: print('z transform advantages') else: print('no z transform') # We will collect 100 trajectories per iteration N = 50 # Each trajectory will have at most 100 time steps T = 50 # Number of iterations n_itr = 50 # Set the discount factor for the problem discount = 0.99 # Learning rate for the gradient update learning_rate = 0.1 # Construct the computation graph # Create a Theano variable for storing the observations # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However, # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data # type for the variable. For instance, for an environment with discrete observations, we might want to use integer # types if the observations are represented as one-hot vectors. observations_var = env.observation_space.new_tensor_variable( 'observations', # It should have 1 extra dimension since we want to represent a list of observations extra_dims=1 ) actions_var = env.action_space.new_tensor_variable( 'actions', extra_dims=1 ) advantages_var = TT.vector('advantages') # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution # Note that we negate the objective, since most optimizers assume a # minimization problem surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var) # Get the list of trainable parameters. 
params = policy.get_params(trainable=True) grads = theano.grad(surr, params) f_train = theano.function( inputs=[observations_var, actions_var, advantages_var], outputs=None, updates=adam(grads, params, learning_rate=learning_rate), allow_input_downcast=True ) results = [] for _ in range(n_itr): paths = [] for _ in range(N): observations = [] actions = [] rewards = [] observation = env.reset() for _ in range(T): # policy.get_action() returns a pair of values. The second one returns a dictionary, whose values contains # sufficient statistics for the action distribution. It should at least contain entries that would be # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym(). # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is # not needed. action, _ = policy.get_action(observation) # Recall that the last entry of the tuple stores diagnostic information about the environment. In our # case it is not needed. next_observation, reward, terminal, _ = env.step(action) observations.append(observation) actions.append(action) rewards.append(reward) observation = next_observation if terminal: # Finish rollout if terminal state reached break # We need to compute the empirical return for each time step along the # trajectory path = dict( observations=np.array(observations), actions=np.array(actions), rewards=np.array(rewards), ) path_baseline = baseline.predict(path) advantages = [] returns = [] return_so_far = 0 for t in range(len(rewards) - 1, -1, -1): return_so_far = rewards[t] + discount * return_so_far returns.append(return_so_far) advantage = return_so_far - path_baseline[t] advantages.append(advantage) # The advantages are stored backwards in time, so we need to revert it advantages = np.array(advantages[::-1]) # And we need to do the same thing for the list of returns returns = np.array(returns[::-1]) if "_ztrans" in mode: advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8) path["advantages"] = advantages path["returns"] = returns paths.append(path) baseline.fit(paths) observations = np.concatenate([p["observations"] for p in paths]) actions = np.concatenate([p["actions"] for p in paths]) advantages = np.concatenate([p["advantages"] for p in paths]) if mode == 'batchavg': # in this case `advantages` up to here are just our good old returns, without baseline or z transformation. # now we subtract their mean across all episodes. advantages = advantages - np.mean(advantages) f_train(observations, actions, advantages) avgr = np.mean([sum(p["rewards"]) for p in paths]) print(('Average Return:',avgr)) results.append(avgr) return results
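A minimal driver sketch (not part of the original script) showing how the modes handled by `doit` above could be compared; the mode strings follow the substring checks inside the function:

# Assumes the doit() function defined above is in scope.
if __name__ == "__main__":
    all_results = {}
    for mode in ["vanilla", "linbaseline", "linbaseline_ztrans", "batchavg"]:
        print("Running mode:", mode)
        all_results[mode] = doit(mode)  # list of average returns per iteration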
rd.seed(seed) ### seed %= 4294967294 global seed_ seed_ = seed rd.seed(seed) np.random.seed(seed) try: import tensorflow as tf tf.set_random_seed(seed) except Exception as e: print(e) print('using seed %s' % (str(seed))) env = TfEnv(normalize(PointEnvRandGoal())) policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=fast_learning_rate, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), std_modifier=pre_std_modifier, ) if bas == 'zero': baseline = ZeroBaseline(env_spec=env.spec) elif 'linear' in bas: baseline = LinearFeatureBaseline(env_spec=env.spec) else: baseline = GaussianMLPBaseline(env_spec=env.spec) #expert_policy = PointEnvExpertPolicy(env_spec=env.spec)
] other_env_class_map = {"Cartpole": CartpoleEnv} if args.env in supported_gym_envs: gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) # gymenv.env.seed(1) else: gymenv = other_env_class_map[args.env]() #TODO: assert continuous space env = TfEnv(normalize(gymenv)) policy = DeterministicMLPPolicy( env_spec=env.spec, name="policy", # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(100, 50, 25), hidden_nonlinearity=tf.nn.relu, ) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=(100, 100), hidden_nonlinearity=tf.nn.relu,
step_sizes = [0.5, 0.5, 0.5,0.0, 0.5] initial_params_files = [initial_params_file1, initial_params_file3, None,initial_params_file4] gen_name = 'icml_point_results_' names = ['maml','maml0','random','oracle'] exp_names = [gen_name + name for name in names] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal in goals: goal = list(goal) if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(PointEnvRandGoalOracle(goal=goal)) n_itr = 1 else: env = normalize(PointEnvRandGoal(goal=goal)) n_itr = 5 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None
from sandbox.rocky.tf.algos.pg_stein import PGStein
from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(DoublePendulumEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(100, 50, 25),
    adaptive_std=True,
    std_hidden_sizes=(100, 25),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = PGStein(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=500,
    n_itr=100,
import os.path as osp

PROJECT_PATH = osp.abspath(osp.dirname(__file__))

# hyperparameters
num_of_generations = 201
num_of_steps = 10000
now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
is_render = False

# config file
config_path = 'config/asteroids'

env = normalize(GymEnv("Asteroids-ramNoFrameskip-v0"))

policy = PowerGradientPolicy(
    env_spec=env.spec,
    neat_output_dim=(64, ),
    # The neural network policy should have two hidden layers with 64 and 32 hidden units.
    hidden_sizes=(64, 32))

# Load policy parameters (weights and biases) of the pretrained network
policy.load_policy('policy_parameters/model-asteroids.npz')


def do_rollout(agent, render=False):
    rewards = []
    for i in range(10):
        ob = env.reset()
        t = 0
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.algos.ddpg import DDPG

stub(globals())

env = normalize(GymEnv("Quad-v0"))

use_trpo = True

if use_trpo:
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
rand_step_test_rew_summary = data['rand_step_test'] adv_test_rew_summary = data['adv_test'] ne = data['exp_save'] ni = data['iter_save'] save_prefix = 'BASELINE-env-{}_{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}'.format( env_name, adv_name, n_exps, n_itr, batch_size, adv_fraction, step_size, gae_lambda) save_dir = os.environ['HOME'] + '/btpstuff/rllab-adv/results/baselines' fig_dir = 'figs' save_name = save_dir + '/' + save_prefix + '.p' fig_name = fig_dir + '/' + save_prefix + '.png' while ne < n_exps: ## Environment definition ## env = normalize(GymEnv(env_name, adv_fraction)) ## Protagonist policy definition ## pro_policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=layer_size, is_protagonist=True) pro_baseline = LinearFeatureBaseline(env_spec=env.spec) ## Zero Adversary for the protagonist training ## zero_adv_policy = ConstantControlPolicy(env_spec=env.spec, is_protagonist=False, constant_val=0.0) ## Optimizer for the Protagonist ## from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_process) if adv_name == 'no_adv':
from envs.bullet.cartpole_bullet import CartPoleBulletEnv

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(GymEnv("CartPoleBulletEnv-v0"))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have one hidden layer with 8 hidden units.
    hidden_sizes=(8,)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=5000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.999,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
algo.train()
from rllab.algos.ddpg_polyRL import DDPG from rllab.envs.normalized_env import normalize from rllab.misc.instrument import run_experiment_lite from rllab.exploration_strategies.ou_strategy import OUStrategy from rllab.exploration_strategies.persistence_length_2D_v2 import Persistence_Length_Exploration from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction from rllab.envs.mujoco.swimmer_env import SwimmerEnv env = normalize(SwimmerEnv()) def run_task(*_): """ DPG on Swimmer environment """ env = normalize(SwimmerEnv()) """ Initialise the policy as a neural network policy """ policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) """ Defining exploration strategy : OUStrategy - """ """ This strategy implements the Ornstein-Uhlenbeck process, which adds time-correlated noise to the actions taken by the deterministic policy.
fast_learning_rates = [0.1] baselines = ['linear'] fast_batch_size = 20 meta_batch_size = 60 max_path_length = 10 num_grad_updates = 1 meta_step_size = 0.01 use_maml = True for fast_learning_rate in fast_learning_rates: for bas in baselines: stub(globals()) env = TfEnv(normalize(GridWorldEnvRand('four-state'))) policy = MAMLCategoricalMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=fast_learning_rate, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100,100), ) if bas == 'zero': baseline = ZeroBaseline(env_spec=env.spec) elif 'linear' in bas: baseline = LinearFeatureBaseline(env_spec=env.spec) else: baseline = GaussianMLPBaseline(env_spec=env.spec) algo = MAMLTRPO( env=env,
def run_experiment(variant): env_params = variant['env_params'] policy_params = variant['policy_params'] value_fn_params = variant['value_fn_params'] algorithm_params = variant['algorithm_params'] replay_buffer_params = variant['replay_buffer_params'] sampler_params = variant['sampler_params'] task = variant['task'] domain = variant['domain'] env = normalize(ENVIRONMENTS[domain][task](**env_params)) pool = SimpleReplayBuffer(env_spec=env.spec, **replay_buffer_params) sampler = SimpleSampler(**sampler_params) base_kwargs = dict(algorithm_params['base_kwargs'], sampler=sampler) M = value_fn_params['layer_size'] qf1 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf1') qf2 = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M), name='qf2') vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(M, M)) initial_exploration_policy = UniformPolicy(env_spec=env.spec) if policy_params['type'] == 'gaussian': policy = GaussianPolicy( env_spec=env.spec, hidden_layer_sizes=(M, M), reparameterize=policy_params['reparameterize'], reg=1e-3, ) elif policy_params['type'] == 'lsp': nonlinearity = { None: None, 'relu': tf.nn.relu, 'tanh': tf.nn.tanh }[policy_params['preprocessing_output_nonlinearity']] preprocessing_hidden_sizes = policy_params.get( 'preprocessing_hidden_sizes') if preprocessing_hidden_sizes is not None: observations_preprocessor = MLPPreprocessor( env_spec=env.spec, layer_sizes=preprocessing_hidden_sizes, output_nonlinearity=nonlinearity) else: observations_preprocessor = None policy_s_t_layers = policy_params['s_t_layers'] policy_s_t_units = policy_params['s_t_units'] s_t_hidden_sizes = [policy_s_t_units] * policy_s_t_layers bijector_config = { 'num_coupling_layers': policy_params['coupling_layers'], 'translation_hidden_sizes': s_t_hidden_sizes, 'scale_hidden_sizes': s_t_hidden_sizes, } policy = LatentSpacePolicy( env_spec=env.spec, squash=policy_params['squash'], bijector_config=bijector_config, reparameterize=policy_params['reparameterize'], q_function=qf1, observations_preprocessor=observations_preprocessor) elif policy_params['type'] == 'gmm': # reparameterize should always be False if using a GMMPolicy policy = GMMPolicy( env_spec=env.spec, K=policy_params['K'], hidden_layer_sizes=(M, M), reparameterize=policy_params['reparameterize'], qf=qf1, reg=1e-3, ) else: raise NotImplementedError(policy_params['type']) algorithm = SAC( base_kwargs=base_kwargs, env=env, policy=policy, initial_exploration_policy=initial_exploration_policy, pool=pool, qf1=qf1, qf2=qf2, vf=vf, lr=algorithm_params['lr'], scale_reward=algorithm_params['scale_reward'], discount=algorithm_params['discount'], tau=algorithm_params['tau'], reparameterize=algorithm_params['reparameterize'], target_update_interval=algorithm_params['target_update_interval'], action_prior=policy_params['action_prior'], save_full_state=False, ) algorithm._sess.run(tf.global_variables_initializer()) algorithm.train()
from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
def run_FaReLI(input_feed=None): beta_adam_steps_list = [(1,50)] # beta_curve = [250,250,250,250,250,5,5,5,5,1,1,1,1,] # make sure to check maml_experiment_vars # beta_curve = [1000] # make sure to check maml_experiment_vars adam_curve = [250,249,248,247,245,50,50,10] # make sure to check maml_experiment_vars # adam_curve = None fast_learning_rates = [1.0] baselines = ['linear',] # linear GaussianMLP MAMLGaussianMLP zero env_option = '' # mode = "ec2" mode = "local" extra_input = "onehot_exploration" # "onehot_exploration" "gaussian_exploration" # extra_input = None extra_input_dim = 5 # extra_input_dim = None goals_suffixes = ["_200_40_1"] #,"_200_40_2", "_200_40_3","_200_40_4"] # goals_suffixes = ["_1000_40"] fast_batch_size_list = [20] # 20 # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2] #inner grad update size meta_batch_size_list = [40] # 40 @ 10 also works, but much less stable, 20 is fairly stable, 40 is more stable max_path_length = 100 # 100 num_grad_updates = 1 meta_step_size = 0.01 pre_std_modifier_list = [1.0] post_std_modifier_train_list = [0.00001] post_std_modifier_test_list = [0.00001] l2loss_std_mult_list = [1.0] importance_sampling_modifier_list = [''] #'', 'clip0.5_' limit_demos_num_list = [1] # 40 test_goals_mult = 1 bas_lr = 0.01 # baseline learning rate momentum=0.5 bas_hnl = tf.nn.relu baslayers_list = [(32,32), ] basas = 60 # baseline adam steps use_corr_term = True seeds = [1] #,2,3,4,5] envseeds = [6] use_maml = True test_on_training_goals = False for goals_suffix in goals_suffixes: for envseed in envseeds: for seed in seeds: for baslayers in baslayers_list: for fast_batch_size in fast_batch_size_list: for meta_batch_size in meta_batch_size_list: for ism in importance_sampling_modifier_list: for limit_demos_num in limit_demos_num_list: for l2loss_std_mult in l2loss_std_mult_list: for post_std_modifier_train in post_std_modifier_train_list: for post_std_modifier_test in post_std_modifier_test_list: for pre_std_modifier in pre_std_modifier_list: for fast_learning_rate in fast_learning_rates: for beta_steps, adam_steps in beta_adam_steps_list: for bas in baselines: stub(globals()) tf.set_random_seed(seed) np.random.seed(seed) rd.seed(seed) env = TfEnv(normalize(Reacher7DofMultitaskEnv(envseed=envseed))) exp_name = str( 'R7_IL' # +time.strftime("%D").replace("/", "")[0:4] + goals_suffix + "_" + str(seed) # + str(envseed) + ("" if use_corr_term else "nocorr") # + str(int(use_maml)) + ('_fbs' + str(fast_batch_size) if fast_batch_size!=20 else "") + ('_mbs' + str(meta_batch_size) if meta_batch_size!=40 else "") + ('_flr' + str(fast_learning_rate) if fast_learning_rate!=1.0 else "") + '_dem' + str(limit_demos_num) + ('_ei' + str(extra_input_dim) if type( extra_input_dim) == int else "") # + '_tgm' + str(test_goals_mult) # +'metalr_'+str(meta_step_size) # +'_ngrad'+str(num_grad_updates) + ("_bs" + str(beta_steps) if beta_steps != 1 else "") + "_as" + str(adam_steps) # +"_net" + str(net_size[0]) # +"_L2m" + str(l2loss_std_mult) + ("_prsm" + str( pre_std_modifier) if pre_std_modifier != 1 else "") # + "_pstr" + str(post_std_modifier_train) # + "_posm" + str(post_std_modifier_test) # + "_l2m" + str(l2loss_std_mult) + ("_" + ism if len(ism) > 0 else "") + "_bas" + bas[0] # +"_tfbe" # TF backend for baseline # +"_qdo" # quad dist optimizer + (("_bi" if bas_hnl == tf.identity else ( "_brel" if bas_hnl == tf.nn.relu else "_bth")) # identity or relu or tanh for baseline # + "_" + str(baslayers) # size + "_baslr" + str(bas_lr) + "_basas" + str(basas) if bas[0] in 
["G", "M"] else "") # baseline adam steps + ("r" if test_on_training_goals else "") + "_" + time.strftime("%d%m_%H_%M")) policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=fast_learning_rate, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), std_modifier=pre_std_modifier, # metalearn_baseline=(bas == "MAMLGaussianMLP"), extra_input_dim=(0 if extra_input is None else extra_input_dim), ) if bas == 'zero': baseline = ZeroBaseline(env_spec=env.spec) elif bas == 'MAMLGaussianMLP': baseline = MAMLGaussianMLPBaseline(env_spec=env.spec, learning_rate=bas_lr, hidden_sizes=baslayers, hidden_nonlinearity=bas_hnl, repeat=basas, repeat_sym=basas, momentum=momentum, extra_input_dim=( 0 if extra_input is None else extra_input_dim), # learn_std=False, # use_trust_region=False, # optimizer=QuadDistExpertOptimizer( # name="bas_optimizer", # # tf_optimizer_cls=tf.train.GradientDescentOptimizer, # # tf_optimizer_args=dict( # # learning_rate=bas_lr, # # ), # # # tf_optimizer_cls=tf.train.AdamOptimizer, # # max_epochs=200, # # batch_size=None, # adam_steps=basas # ) ) elif bas == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif "GaussianMLP" in bas: baseline = GaussianMLPBaseline(env_spec=env.spec, regressor_args=dict( hidden_sizes=baslayers, hidden_nonlinearity=bas_hnl, learn_std=False, # use_trust_region=False, # normalize_inputs=False, # normalize_outputs=False, optimizer=QuadDistExpertOptimizer( name="bas_optimizer", # tf_optimizer_cls=tf.train.GradientDescentOptimizer, # tf_optimizer_args=dict( # learning_rate=bas_lr, # ), # # tf_optimizer_cls=tf.train.AdamOptimizer, # max_epochs=200, # batch_size=None, adam_steps=basas, use_momentum_optimizer=True, ))) algo = MAMLIL( env=env, policy=policy, baseline=baseline, batch_size=fast_batch_size, # number of trajs for alpha grad update max_path_length=max_path_length, meta_batch_size=meta_batch_size, # number of tasks sampled for beta grad update num_grad_updates=num_grad_updates, # number of alpha grad updates n_itr=800, #100 make_video=True, use_maml=use_maml, use_pooled_goals=True, use_corr_term=use_corr_term, test_on_training_goals=test_on_training_goals, metalearn_baseline=(bas=="MAMLGaussianMLP"), # metalearn_baseline=False, limit_demos_num=limit_demos_num, test_goals_mult=test_goals_mult, step_size=meta_step_size, plot=False, beta_steps=beta_steps, adam_curve=adam_curve, adam_steps=adam_steps, pre_std_modifier=pre_std_modifier, l2loss_std_mult=l2loss_std_mult, importance_sampling_modifier=MOD_FUNC[ism], post_std_modifier_train=post_std_modifier_train, post_std_modifier_test=post_std_modifier_test, expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT[env_option+"."+mode+goals_suffix+("_"+str(extra_input_dim) if type(extra_input_dim) == int else "")], expert_trajs_suffix=("_"+str(extra_input_dim) if type(extra_input_dim) == int else ""), seed=seed, extra_input=extra_input, extra_input_dim=(0 if extra_input is None else extra_input_dim), input_feed=input_feed, run_on_pr2=False, ) run_experiment_lite( algo.train(), n_parallel=1, snapshot_mode="last", python_command='python3', seed=seed, exp_prefix=str('R7_IL_' +time.strftime("%D").replace("/", "")[0:4]), exp_name=exp_name, plot=False, sync_s3_pkl=True, mode=mode, terminate_machine=True, )
gen_name = 'icml_ant_results_' names = ['maml','pretrain','random', 'oracle'] exp_names = [gen_name + name for name in names] step_sizes = [0.1, 0.2, 1.0, 0.0] initial_params_files = [file1, file2, None, file3] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal in goals: if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(AntEnvOracle()) n_itr = 1 else: env = normalize(AntEnvRand()) n_itr = 4 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None
from sandbox.rocky.tf.algos.vpg import VPG
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=100,
    n_itr=40,
    discount=0.99,
    optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))

run_experiment_lite(
    algo.train(),
    n_parallel=2,
return res def dis_iw(iw): z = list() t = 1 for y in iw: z.append(y * t) t *= discount return np.array(z) load_policy = True # normalize() makes sure that the actions for the environment lies # within the range [-1, 1] (only works for environments with continuous actions) env = normalize(GymEnv("Swimmer-v1")) # Initialize a neural network policy with a single hidden layer of 8 hidden units policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32)) snap_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32)) back_up_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32)) parallel_sampler.populate_task(env, policy) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution snap_dist = snap_policy.distribution # We will collect 100 trajectories per iteration N = 100 # Each trajectory will have at most 100 time steps
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) tf_session = tf.Session() inner_env = normalize(AntEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) # GAN logger.log("Instantiating the GAN...") gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} for key, value in gan_configs.items(): if value is tf.train.AdamOptimizer: gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize']) if value is tflearn.initializations.truncated_normal: gan_configs[key] = tflearn.initializations.truncated_normal( stddev=gan_configs[key + '_stddev']) gan = StateGAN( state_size=v['goal_size'], evaluater_size=v['num_labels'], state_range=v['goal_range'], state_center=v['goal_center'], state_noise_level=v['goal_noise_level'], generator_layers=v['gan_generator_layers'], discriminator_layers=v['gan_discriminator_layers'], noise_size=v['gan_noise_size'], tf_session=tf_session, configs=gan_configs, ) # log first samples form the GAN initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) logger.log("Labeling the goals") labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32) # make them all good goals plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], summary_string_base='On-policy Goals:\n') if v['only_on_policy']: goals 
= feasible_goals[np.random.choice( feasible_goals.shape[0], v['num_new_goals'], replace=False), :] else: logger.log("Training the GAN") gan.pretrain(feasible_goals, v['gan_outer_iters']) # Sample GAN logger.log("Sampling goals from the GAN") raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample(v['num_old_goals']) goals = np.vstack([raw_goals, old_goals]) else: goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [ goal for goal, label in zip(goals, labels) if label[0] == 1 ] # this is not used if no replay buffer all_goals.append(filtered_raw_goals) if v['add_on_policy']: logger.log("sampling on policy") feasible_goals = generate_initial_goals( env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:] all_goals.append(feasible_goals)
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.mujoco.swimmer_env import SwimmerEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(SwimmerEnv())

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=500,
    n_itr=40,
    discount=0.99,
    step_size=0.01,
)
algo.train()
meta_iter = FLAGS.meta_iter meta_method = FLAGS.meta_method direc = FLAGS.direc mode = FLAGS.mode load_policy = FLAGS.load_policy # option max_path_length = 200 num_grad_updates = 1 num_leader_grad_updates = 2 stub(globals()) # task type if direc: env = TfEnv(normalize(HalfCheetahEnvRandDirec())) else: env = TfEnv(normalize(HalfCheetahEnvRand())) direc_str = 'direc' if direc else '' # svpg str if svpg: svpg_str = '_SVPG' + '_alpha' + str(svpg_alpha) else: svpg_str = '_VPG' # bmaml|emaml if svpg == False: maml_type = 'emaml' else: maml_type = 'bmaml'
def create_env_rllab(env, seed):
    # Strip the "rllab." prefix (e.g. "rllab.CartpoleEnv" -> "CartpoleEnv"),
    # look up the matching rllab environment class, and wrap it with normalize().
    env_name = re.match(r'rllab.(\S+)', env).group(1)
    env_rllab_class = rllab_env_from_name(env_name)
    env = normalize(env_rllab_class())
    return env
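A usage sketch, assuming `rllab_env_from_name` (project-specific, not shown here) resolves names such as "CartpoleEnv" to rllab environment classes; the argument string is only an example of the "rllab.<EnvName>" format the regex above expects:

# Hypothetical call; any "rllab.<EnvName>" recognized by rllab_env_from_name() works.
env = create_env_rllab("rllab.CartpoleEnv", seed=0)  # note: the seed argument is currently unused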
from rllab.algos.spg_ddpg_unified import SPG_DDPG from rllab.envs.normalized_env import normalize from rllab.misc.instrument import run_experiment_lite from rllab.exploration_strategies.ou_strategy import OUStrategy from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy from rllab.policies.stochastic_mlp_policy import GaussianMLPPolicy from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction from rllab.envs.mujoco.hopper_env import HopperEnv env = normalize(HopperEnv()) def run_task(*_): env = normalize(HopperEnv()) # policy = DeterministicMLPPolicy( # env_spec=env.spec, # # The neural network policy should have two hidden layers, each with 32 hidden units. # hidden_sizes=(32, 32) # ) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) es = OUStrategy(env_spec=env.spec) qf = ContinuousMLPQFunction(env_spec=env.spec)
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

stub(globals())

env = normalize(GymEnv("Pendulum-v0"))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 8 hidden units.
    hidden_sizes=(8, 8)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from rllab.sampler import parallel_sampler
from lasagne.updates import sgd
from lasagne.updates import adam
from rllab.misc import ext
import pandas as pd

load_policy = True

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())

# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ), learn_std=False)

parallel_sampler.populate_task(env, policy)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities
# for computing distribution-related quantities, given the computed dist_info_vars. Below we use
# dist.log_likelihood_sym to compute the symbolic log-likelihood. For this example, the corresponding
# distribution is an instance of the class rllab.distributions.DiagonalGaussian
dist = policy.distribution

# We will collect N = 10 trajectories per iteration
N = 10
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 1000
parser = argparse.ArgumentParser(description='Train a policy') parser.add_argument('-a', action="store", dest="alg") parser.add_argument('-e', action="store", dest="env") parsed = parser.parse_args() stub(globals()) alg = "DDPG" envs = {"Arm": ArmEnv, "Stand": StandEnv, "Gait": GaitEnv, "Crouch": CrouchEnv, "Hop": HopEnv} env = normalize(envs[parsed.env](visualize=False)) # env = normalize(CartpoleEnv()) # env = normalize(GymEnv("Pendulum-v0", record_video=False, record_log=False)) if alg == "DDPG": qf = ContinuousMLPQFunction( env_spec=env.spec, hidden_sizes=(64, 64, 64) ) es = OUStrategy(env_spec=env.spec, theta = 0.5) policy = DeterministicMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units.
# stub(globals())
#
# supported_envs = ["MountainCar-v0", "CartPole-v0"]
#
# if args.env not in supported_envs:
#     raise Exception("Env not supported! Try it out though?")

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
register_custom_envs()

gymenv = GymEnv(args.env, force_reset=True)
# gymenv.env.seed(124)
env = TfEnv(normalize(gymenv, normalize_obs=False))

# Pick the policy class based on the action space type. Note that comparing the
# space against the string 'Discrete' is always False; use isinstance instead.
from sandbox.rocky.tf.spaces.discrete import Discrete

if isinstance(env.spec.action_space, Discrete):
    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )
else:
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25)
    )
from __future__ import print_function
from __future__ import absolute_import

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

stub(globals())

# env = normalize(GymEnv("Pendulum-v0", record_video=False))
env = normalize(GymEnv("VREP-v0", record_video=False))

policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 128 hidden units.
    hidden_sizes=(128, 128)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    # batch_size=4000,
    batch_size=1000,
    max_path_length=env.horizon,
    n_itr=500,
step_sizes = [0.1, 0.2, 1.0, 0.0] initial_params_files = [file1, file2, None, file3] names = ['random'] exp_names = [gen_name + name for name in names] initial_params_files = [None] step_sizes = [0.5] all_avg_returns = [] for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files): avg_returns = [] for goal_i, goal in zip(range(len(goals)), goals): if initial_params_file is not None and 'oracle' in initial_params_file: env = normalize(HalfCheetahEnvDirecOracle()) n_itr = 1 else: env = normalize(HalfCheetahEnvRandDirec()) n_itr = 4 env = TfEnv(env) policy = GaussianMLPPolicy( # random policy name='policy', env_spec=env.spec, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), ) if initial_params_file is not None: policy = None
def run_task(*_): v_enter = 30 inner_length = 800 long_length = 100 short_length = 800 n = 1 m = 5 num_cars_left = 3 num_cars_right = 3 num_cars_top = 15 num_cars_bot = 15 tot_cars = (num_cars_left + num_cars_right) * m \ + (num_cars_bot + num_cars_top) * n grid_array = { "short_length": short_length, "inner_length": inner_length, "long_length": long_length, "row_num": n, "col_num": m, "cars_left": num_cars_left, "cars_right": num_cars_right, "cars_top": num_cars_top, "cars_bot": num_cars_bot } sumo_params = SumoParams(sim_step=1, sumo_binary="sumo-gui") vehicles = Vehicles() vehicles.add(veh_id="idm", acceleration_controller=(SumoCarFollowingController, {}), sumo_car_following_params=SumoCarFollowingParams( minGap=2.5, max_speed=v_enter, ), routing_controller=(GridRouter, {}), num_vehicles=tot_cars, speed_mode="all_checks") additional_env_params = { "target_velocity": 50, "num_steps": 500, "control-length": 150, "switch_time": 3.0 } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1, "traffic_lights": True } initial_config, net_params = get_non_flow_params(10, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", generator_class=SimpleGridGenerator, vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "GreenWaveEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
""" Returns a Space object """ low = np.array( [0, -np.pi / 2, -np.pi / 2, 0, -np.pi, -np.pi, 0, -np.pi, -np.pi]) high = np.array([ 100, np.pi / 2, np.pi / 2, 1000, np.pi, np.pi, 1000, np.pi, -np.pi ]) return Box(low=low, high=high) def log_diagnostics(self, paths): pass if __name__ == "__main__": from rllab.algos.trpo import TRPO from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab.envs.normalized_env import normalize from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy env = normalize(FlightEnv()) policy = GaussianMLPPolicy(env_spec=env.spec, ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO(env=env, policy=policy, baseline=baseline, max_path_length=400, batch_size=4000, gae_lambda=0.7) algo.train()
parser.add_argument("--text_log_file", default="./data/debug.log", help="Where text output will go") parser.add_argument("--tabular_log_file", default="./data/progress.csv", help="Where tabular output will go") args = parser.parse_args() # stub(globals()) # ext.set_seed(1) logger.add_text_output(args.text_log_file) logger.add_tabular_output(args.tabular_log_file) logger.set_log_tabular_only(False) envs = [] for env_name in args.envs: gymenv = GymEnv(env_name, force_reset=True, record_video=False, record_log=False) env = TfEnv(normalize(gymenv)) envs.append((env_name, env)) policy = GaussianMLPPolicy( name="policy", env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(100, 50, 25), hidden_nonlinearity=tf.nn.relu, ) baseline = LinearFeatureBaseline(env_spec=env.spec) with tf.Session() as sess: for env_name, env in envs:
rand_test_rew_summary = []
step_test_rew_summary = []
rand_step_test_rew_summary = []
adv_test_rew_summary = []

## Preparing file to save results in ##
save_prefix = 'env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
    env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
    gae_lambda, random.randint(0, 1000000))
save_name = save_dir + '/' + save_prefix + '.p'

## Looping over experiments to carry out ##
for ne in range(n_exps):
    ## Environment definition ##
    ## The second argument in GymEnv defines the relative magnitude of the
    ## adversary. For testing we set this to 1.0.
    env = normalize(GymEnv(env_name, adv_fraction))
    env_orig = normalize(GymEnv(env_name, 1.0))

    ## Protagonist policy definition ##
    pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=layer_size,
                                   is_protagonist=True)
    pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

    ## Zero adversary for the protagonist training ##
    zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                            is_protagonist=False,
                                            constant_val=0.0)

    ## Adversary policy definition ##
    adv_policy = GaussianMLPPolicy(env_spec=env.spec,
# should also code up alternative KL thing

variants = VG().variants()

max_path_length = 200
num_grad_updates = 1
use_maml = True

for v in variants:
    direc = v['direc']
    learning_rate = v['meta_step_size']

    if direc:
        env = TfEnv(normalize(HalfCheetahEnvRandDirec()))
    else:
        env = TfEnv(normalize(HalfCheetahEnvRand()))

    policy = MAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=v['fast_lr'],
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = MAMLTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
def train(num_experiments, thread_id, queue):

    ############ DEFAULT PARAMETERS ############
    env_name = None                       # Name of adversarial environment
    path_length = 1000                    # Maximum episode length
    layer_size = tuple([100, 100, 100])   # Layer definition
    ifRender = False                      # Should we render?
    afterRender = 100                     # After how many iterations to animate
    n_exps = 1                            # Number of training instances to run
    n_itr = 25                            # Number of iterations of the alternating optimization
    n_pro_itr = 1                         # Number of iterations for the protagonist
    n_adv_itr = 1                         # Number of iterations for the adversary
    batch_size = 4000                     # Number of training samples for each iteration
    ifSave = True                         # Should we save?
    save_every = 100                      # Save checkpoint every save_every iterations
    n_process = 1                         # Number of parallel threads for sampling environment
    adv_fraction = 0.25                   # Fraction of maximum adversarial force to be applied
    step_size = 0.01                      # KL step size for TRPO
    gae_lambda = 0.97                     # gae_lambda for learner
    save_dir = './results'                # Folder to save results in

    ############ ENV SPECIFIC PARAMETERS ############
    env_name = 'Walker2dAdv-v1'
    layer_size = tuple([64, 64])
    step_size = 0.1
    gae_lambda = 0.97
    batch_size = 25000
    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4
    adv_fraction = 5.0
    adv_strengths = []
    for i in range(0, int(adv_fraction) + 1, 1):
        adv_strengths.append(i)
    save_dir = './../results/AdvWalker'

    args = [
        env_name, path_length, layer_size, ifRender, afterRender, n_exps,
        n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process,
        adv_fraction, step_size, gae_lambda, save_dir
    ]

    ############ ADVERSARIAL POLICY LOAD ############
    filepath = './../initial_results/Walker/env-Walker2dAdv-v1_Exp1_Itr1500_BS25000_Adv0.25_stp0.01_lam0.97_507500.p'
    res_D = pickle.load(open(filepath, 'rb'))
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############
    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument in GymEnv defines the relative magnitude of the
        ## adversary. For testing we set this to 1.0.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env, pro_policy, adv_policy,
                            path_length=path_length))

        ## Looping through adversary strength levels ##
        n_loopsize = int(n_itr / len(adv_strengths))
        for adv_index, adv_strength in enumerate(adv_strengths):
            env = normalize(GymEnv(env_name, adv_strength))

            ## Optimizer for the protagonist ##
            pro_algo = TRPO(env=env,
                            pro_policy=pro_policy,
                            adv_policy=adv_policy,
                            pro_baseline=pro_baseline,
                            adv_baseline=adv_baseline,
                            batch_size=batch_size,
                            max_path_length=path_length,
                            n_itr=n_pro_itr,
                            discount=0.995,
                            gae_lambda=gae_lambda,
                            step_size=step_size,
                            is_protagonist=True)

            logger.log(
                '\n\nAdversarial Level: {} Adversarial Strength: {}\n'.format(
                    adv_index, adv_strength))

            ## Beginning alternating optimization ##
            for ni in range(n_loopsize):
                logger.log(
                    '\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                        thread_id, ne, ni + n_loopsize * adv_index))

                ## Train protagonist ##
                pro_algo.train()
                pro_rews += pro_algo.rews
                all_rews += pro_algo.rews
                logger.log('Protag Reward: {}'.format(
                    np.array(pro_algo.rews).mean()))

                ## Test the learnt policies ##
                const_testing_rews.append(
                    test_const_adv(env, pro_policy, path_length=path_length))
                rand_testing_rews.append(
                    test_rand_adv(env, pro_policy, path_length=path_length))
                step_testing_rews.append(
                    test_step_adv(env, pro_policy, path_length=path_length))
                rand_step_testing_rews.append(
                    test_rand_step_adv(env, pro_policy,
                                       path_length=path_length))
                adv_testing_rews.append(
                    test_learnt_adv(env, pro_policy, adv_policy,
                                    path_length=path_length))

                if ni % afterRender == 0 and ifRender == True:
                    test_const_adv(env, pro_policy, path_length=path_length,
                                   n_traj=1, render=True)

                if ni != 0 and ni % save_every == 0 and ifSave == True:
                    ## SAVING CHECKPOINT INFO ##
                    pickle.dump(
                        {
                            'args': args,
                            'pro_policy': pro_policy,
                            'adv_policy': adv_policy,
                            'zero_test': [const_testing_rews],
                            'rand_test': [rand_testing_rews],
                            'step_test': [step_testing_rews],
                            'rand_step_test': [rand_step_testing_rews],
                            'iter_save': ni,
                            'exp_save': ne,
                            'adv_test': [adv_testing_rews]
                        },
                        open(
                            save_name + '_' +
                            str(ni + n_loopsize * adv_index) + '.p', 'wb'))

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances ##
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([
        const_test_rew_summary, rand_test_rew_summary,
        step_test_rew_summary, rand_step_test_rew_summary,
        adv_test_rew_summary
    ])

    ############ SAVING MODEL ############
    '''
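train() reports its summaries only through the queue argument, so it is meant to be driven from a parent process. The following is a hedged driver sketch; the process count and experiment count are illustrative assumptions, not values from the original file.

# Hypothetical driver: launch train() in worker processes and collect results.
from multiprocessing import Process, Queue

if __name__ == '__main__':
    num_experiments = 1
    n_threads = 2
    queue = Queue()
    workers = [
        Process(target=train, args=(num_experiments, thread_id, queue))
        for thread_id in range(n_threads)
    ]
    for w in workers:
        w.start()
    results = [queue.get() for _ in workers]  # one summary bundle per worker
    for w in workers:
        w.join()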
class VG(VariantGenerator):

    @variant
    def step_size(self):
        return [0.01, 0.05, 0.1]

    @variant
    def seed(self):
        return [1, 11, 21, 31, 41]


variants = VG().variants()

for v in variants:
    env = TfEnv(normalize(GymEnv('HalfCheetah-v1',
                                 record_video=False,
                                 record_log=False)))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        name="policy"
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
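The TRPO call in the sweep above is truncated before the launch. With rllab's instrument API, each variant is usually dispatched roughly as below; this sketch assumes the call is completed with step_size=v['step_size'], that the script is stubbed via stub(globals()), and that run_experiment_lite is imported from rllab.misc.instrument. The worker count and experiment prefix are assumptions.

    # Hypothetical per-variant launch.
    run_experiment_lite(
        algo.train(),
        n_parallel=4,                       # assumed worker count
        seed=v["seed"],                     # seed taken from the variant
        snapshot_mode="last",
        exp_prefix="trpo_step_size_sweep",  # hypothetical prefix
        variant=v,                          # record the variant alongside the logs
    )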
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from ddpg import DDPG
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from policies import DeterministicMLPPolicy
from qfuncs import ContinuousMLPQ
from strategies import OUStrategy
from utils import SEED

import mxnet as mx

# set environment, policy, qfunc, strategy
env = normalize(CartpoleEnv())
policy = DeterministicMLPPolicy(env.spec)
qfunc = ContinuousMLPQ(env.spec)
strategy = OUStrategy(env.spec)

# set the training algorithm and train
algo = DDPG(
    env=env,
    policy=policy,
    qfunc=qfunc,
    strategy=strategy,
    ctx=mx.gpu(0),
    max_path_length=100,
    epoch_length=1000,
gen_name = 'icml_antdirec_results_'
names = ['maml', 'pretrain', 'random', 'oracle']
step_sizes = [0.1, 0.2, 1.0, 0.0]
initial_params_files = [file1, file2, None, file3]
exp_names = [gen_name + name for name in names]

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal_i, goal in zip(range(len(goals)), goals):
        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(AntEnvDirecOracle())
            n_itr = 1
        else:
            env = normalize(AntEnvRandDirec())
            n_itr = 4
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )
        if initial_params_file is not None:
            policy = None
ab_l1 = dict(mode='ours', mode2='ab_l1', scale=0.01,
             modelname='model/pushreal_l1/ablation_pushreal_L1_30000')

seeds = [123]
for params in [real_params]:
    for nvar in range(10):
        randparams = params['rand']()
        for modeparams in [ab_l2]:  # , ours_mode, ours_nofeat, ours_noimage, ab_l2l3, ab_l1]:
            copyparams = randparams.copy()
            copyparams.update(modeparams)
            mdp = normalize(GymEnv(params['env'], **copyparams))
            for seed in seeds:
                policy = GaussianMLPPolicy(env_spec=mdp.spec,
                                           hidden_sizes=(32, 32),
                                           init_std=10)
                baseline = LinearFeatureBaseline(mdp.spec)
                batch_size = 50 * 250
                algo = TRPO(env=mdp,
                            policy=policy,
                            baseline=baseline,
                            batch_size=batch_size,
                            whole_paths=True,
                            max_path_length=50,
                            n_itr=100,
fast_learning_rates = [0.5]
baselines = ['linear']
fast_batch_size = 20   # 10 works for [0.1, 0.2], 20 doesn't improve much for [0, 0.2]
meta_batch_size = 40   # 10 also works, but much less stable; 20 is fairly stable, 40 is more stable
max_path_length = 100
num_grad_updates = 1
meta_step_size = 0.01
use_maml = True

for fast_learning_rate in fast_learning_rates:
    for learning_rate in learning_rates:
        for bas in baselines:
            stub(globals())
            env = TfEnv(normalize(PointEnvRandGoal()))
            policy = MAMLGaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                grad_step_size=fast_learning_rate,
                hidden_nonlinearity=tf.nn.relu,
                hidden_sizes=(100, 100),
            )
            if bas == 'zero':
                baseline = ZeroBaseline(env_spec=env.spec)
            elif 'linear' in bas:
                baseline = LinearFeatureBaseline(env_spec=env.spec)
            else:
                baseline = GaussianMLPBaseline(env_spec=env.spec)
            algo = MAMLTRPO(
                env=env,
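                # Hedged continuation: the original MAMLTRPO call is truncated
                # here. The remaining keyword arguments below follow the public
                # MAML-rllab examples and are an assumption about this fork;
                # n_itr and the launch settings are illustrative.
                policy=policy,
                baseline=baseline,
                batch_size=fast_batch_size,        # trajectories per fast-adaptation step
                max_path_length=max_path_length,
                meta_batch_size=meta_batch_size,   # tasks sampled per meta-iteration
                num_grad_updates=num_grad_updates,
                n_itr=100,                         # assumed meta-iteration count
                use_maml=use_maml,
                step_size=meta_step_size,
            )
            run_experiment_lite(
                algo.train(),
                n_parallel=1,
                snapshot_mode="last",
                seed=1,                            # assumed seed
                exp_prefix="maml_point_sketch",    # hypothetical prefix
            )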
gen_name = 'icml_cheetah_results_'
names = ['maml', 'pretrain', 'random', 'oracle']
exp_names = [gen_name + name for name in names]
step_sizes = [0.1, 0.02, 0.1, 0.0]
initial_params_files = [file1]  # , None, None, None

all_avg_returns = []
for step_i, initial_params_file in zip(range(len(step_sizes)), initial_params_files):
    avg_returns = []
    for goal in goals:
        if initial_params_file is not None and 'oracle' in initial_params_file:
            env = normalize(HalfCheetahEnvOracle())
            n_itr = 1
        else:
            env = normalize(HalfCheetahEnvRandDisable())
            n_itr = 5
        env = TfEnv(env)
        policy = GaussianMLPPolicy(  # random policy
            name='policy',
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
        )
        if initial_params_file is not None:
            policy = None
from rllab.algos.ddpg import DDPG
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv

env = normalize(SimpleHumanoidEnv())

# H_layer_first = [32, 100, 400]
# H_layer_second = [32, 100, 300]
H_layer_first = [32]
H_layer_second = [32]

# reward_scaling = [0.01, 0.1, 1.0]
reward_scaling = [0.01]

# critic_learning_rate = [1e-3, 10e-3]
# actor_learning_rate = [1e-4, 10e-4]
critic_learning_rate = [0.001]
actor_learning_rate = [0.0001]

# 0.99 was originally set by rllab
discount_factor = 0.99

# originally 32, set by rllab
size_of_batch = 64
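The lists above define a small hyperparameter grid, but the snippet stops before consuming it. Below is a hedged sketch of how such a grid is typically swept: one rllab DDPG run per combination, using the imports already present above. The path length, epoch settings, and pool size are assumptions, not values from the original script.

# Hypothetical sweep over the grid defined above; one DDPG run per combination.
import itertools

for h1, h2, scale, qf_lr, policy_lr in itertools.product(
        H_layer_first, H_layer_second, reward_scaling,
        critic_learning_rate, actor_learning_rate):
    policy = DeterministicMLPPolicy(env_spec=env.spec, hidden_sizes=(h1, h2))
    es = OUStrategy(env_spec=env.spec)
    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(h1, h2))
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        batch_size=size_of_batch,
        max_path_length=100,        # assumed horizon
        epoch_length=1000,          # assumed epoch length
        min_pool_size=10000,        # assumed replay warm-up
        n_epochs=1000,              # assumed epoch count
        discount=discount_factor,
        scale_reward=scale,
        qf_learning_rate=qf_lr,
        policy_learning_rate=policy_lr,
    )
    algo.train()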
mode = "local" n_parallel = 4 exp_dir = '/home/lsy/Desktop/rllab/data/local/egoSwimmer-snn/' for dir in os.listdir(exp_dir): if 'Figure' not in dir and os.path.isfile( os.path.join(exp_dir, dir, 'params.pkl')): pkl_path = os.path.join(exp_dir, dir, 'params.pkl') print("hier for : ", pkl_path) for time_step_agg in [10, 50, 100]: for activity_range in [6, 10, 15]: inner_env = normalize( SwimmerGatherEnv(activity_range=activity_range, sensor_range=activity_range, sensor_span=math.pi * 2, ego_obs=True)) env = hierarchize_snn( inner_env, time_steps_agg=time_step_agg, pkl_path=pkl_path, # animate=True, ) policy = CategoricalMLPPolicy(env_spec=env.spec, ) baseline = LinearFeatureBaseline(env_spec=env.spec) # bonus_evaluators = [GridBonusEvaluator(mesh_density=mesh_density, visitation_bonus=1, snn_H_bonus=0)] # reward_coef_bonus = [reward_coef]
if use_tf:
    from sandbox.rocky.tf.algos.trpo import TRPO
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.envs.base import TfEnv
else:
    from rllab.algos.trpo import TRPO
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

# env = normalize(GymEnv("Pendulum-v0"))
env = normalize(GymEnv("Walker2d-v1"))

if use_tf:
    env = TfEnv(env)
    policy = GaussianMLPPolicy(
        name='policy',
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )
else:
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )
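The snippet above stops after the policy definition. A hedged continuation sketch follows: the baseline and TRPO setup are the same for both backends, and the stubbed call graph is launched with run_experiment_lite, which is already imported above. The batch size, iteration count, and seed are assumptions.

# Hypothetical continuation shared by both the TF and Theano branches.
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,              # assumed value
    max_path_length=env.horizon,
    n_itr=50,                     # assumed value
    discount=0.99,
    step_size=0.01,
)

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,                       # assumed seed
)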
from rllab.envs.gym_env import GymEnv
from railrl.predictors.dynamics_model import ConvEncoder, InverseModel, ForwardModel
from railrl.algos.icm_trpo_tf import ICM
import itertools
import tensorflow as tf

stub(globals())

# Params range
seeds = range(0, 3)

for seed in seeds:
    env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v8',
                                     record_video=False,
                                     log_dir='/tmp/gym_test',
                                     record_log=False)))
    env_spec = env.spec

    cnn = ConvNetwork(
        name="conv_feature_network",
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.flat_dim,
        conv_filters=(32, 32, 32, 32, 32),
        conv_filter_sizes=((3, 3), (3, 3), (3, 3), (3, 3), (3, 3)),
        conv_strides=(2, 2, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256,),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
    )
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    learn_std = True
    init_std = 2

    # hidden_sizes=(8,)
    hidden_sizes = (32, 32)
    # hidden_sizes=(100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    if args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    if args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    #     algo = DDPG(env=env,
    #                 policy=policy,
    #                 baseline=baseline,
    #                 batch_size=batch_size,
    #                 n_itr=n_itr,
    #                 discount=gamma,
    #                 step_size=step_size)

    algo.train()
    return algo
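run_task above reads args.env, args.reward, args.n_itr, and args.algorithm, but the snippet does not show where they come from. Below is a hedged sketch of the argument parsing and launch it appears to rely on; the flag names mirror the attributes referenced above, while the defaults are illustrative assumptions.

# Hypothetical argument parsing and launch for the script above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--env", type=str, default="Pendulum-v0")   # Gym environment id (assumed default)
parser.add_argument("--reward", type=str, default="absolute")   # reward_flag passed to the wrapped env
parser.add_argument("--n_itr", type=int, default=100)           # number of training iterations
parser.add_argument("--algorithm", type=int, default=1,
                    help="0: VPG, 1: TRPO, 2: TNPG")
args = parser.parse_args()

run_task()  # rllab's run_experiment_lite could wrap this call instead for experiment bookkeeping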