def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=(64, )) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = PPO( env=env, policy=policy, baseline=baseline, batch_size=horizon * 32 * 2, max_path_length=horizon, # whole_paths=True, n_itr=400, discount=0.999, # step_size=0.01, ) algo.train()
def run_task(*_): env = normalize( GymEnv("DartHopperRSS-v1", record_log=False, record_video=False)) mp_dim = 1 #policy_pre = joblib.load('data/trained/gradient_temp/rl_split_hopper_3models_taskinput_6432net_sd4_splitstd_maskedgrad_specbaseline_40k_70_30_unweighted_accumulate_gradient/final_policy_0.1.pkl') split_dim = 0 policy = GaussianGRUPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(64, ), ) print('trainable parameter size: ', policy.get_param_values(trainable=True).shape) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) #policy = params['policy'] #baseline = params['baseline'] algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=75000, max_path_length=env.horizon, n_itr=800, discount=0.99, step_size=0.01, gae_lambda=0.98, #mp_dim = mp_dim, #epopt_epsilon = 1.0, #epopt_after_iter = 0, # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, whole_paths=False, ) algo.train()
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = GaussianGRUPolicy(env_spec=env.spec, )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                step_size=0.01,
                optimizer=ConjugateGradientOptimizer(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
    algo.train()
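This run_task variant is only defined, never launched. A minimal sketch of how such a function is typically handed to rllab's run_experiment_lite; the exp_prefix, seed, and snapshot_mode values below are illustrative assumptions, not taken from the original script:

from rllab.misc.instrument import run_experiment_lite

# Launch the experiment defined by run_task above. run_experiment_lite
# serializes the call and executes it; all settings here are placeholders.
run_experiment_lite(
    run_task,
    exp_prefix="cartpole_gru_trpo",  # hypothetical experiment prefix
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)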
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import \
    ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(env_spec=env.spec, )

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" sumo_params = SumoParams(sim_step=0.1, sumo_binary="sumo", seed=0) vehicles = Vehicles() vehicles.add(veh_id="rl", acceleration_controller=(RLController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=1) vehicles.add(veh_id="idm", acceleration_controller=(IDMController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=21) additional_env_params = { "target_velocity": 8, "ring_length": [220, 270], "max_accel": 1, "max_decel": 1 } env_params = EnvParams(horizon=HORIZON, additional_params=additional_env_params, warmup_steps=750) additional_net_params = { "length": 260, "lanes": 1, "speed_limit": 30, "resolution": 40 } net_params = NetParams(additional_params=additional_net_params) initial_config = InitialConfig(spacing="uniform", bunching=50) print("XXX name", exp_tag) scenario = LoopScenario(exp_tag, CircleGenerator, vehicles, net_params, initial_config=initial_config) env_name = "WaveAttenuationPOEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianGRUPolicy( env_spec=env.spec, hidden_sizes=(5, ), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=3600 * 72 * 2, max_path_length=horizon, n_itr=5, # whole_paths=True, discount=0.999, # step_size=v["step_size"], ) algo.train(),
discriminator = Mlp_Discriminator(a_max=0.2,
                                  a_min=0.2,
                                  disc_window=obs_window,
                                  iteration=total_iter,
                                  disc_joints_dim=2,
                                  hidden_sizes=(8, 8))

# initialize the environment
env = normalize(SimpleHumanoidEnv(discriminator=discriminator,
                                  window=obs_window),
                normalize_obs=True)

policy = None
if Policy == "MLP":
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))
if Policy == "GRU":
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_sizes=(64, ),
                               state_include_action=False,
                               hidden_nonlinearity=NL.tanh)

if policy is None:
    print("not a valid policy type")
else:
    base_line_optimizer = ConjugateGradientOptimizer()
    baseline = GaussianMLPBaseline(env.spec,
                                   regressor_args={
                                       "mean_network": None,
                                       "hidden_sizes": (100, 50, 25),
                                       "hidden_nonlinearity": NL.tanh,
                                       "optimizer": base_line_optimizer,
                                       "use_trust_region": True,
                                       "step_size": 0.01,
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--n_good', type=int, default=3)
    parser.add_argument('--n_hostage', type=int, default=5)
    parser.add_argument('--n_bad', type=int, default=5)
    parser.add_argument('--n_coop_save', type=int, default=2)
    parser.add_argument('--n_coop_avoid', type=int, default=2)
    parser.add_argument('--n_sensors', type=int, default=20)
    parser.add_argument('--sensor_range', type=float, default=0.2)
    parser.add_argument('--save_reward', type=float, default=3)
    parser.add_argument('--hit_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.01)
    parser.add_argument('--bomb_reward', type=float, default=-10.)
    parser.add_argument('--recurrent', action='store_true', default=False)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                             '(all iterations will be saved), "last" (only '
                             'the last iteration will be saved), or "none" '
                             '(do not save snapshots)')
Can be either "all" ' '(all iterations will be saved), "last" (only ' 'the last iteration will be saved), or "none" ' '(do not save snapshots)') parser.add_argument( '--log_tabular_only', type=ast.literal_eval, default=False, help= 'Whether to only print the tabular log information (in a horizontal format)' ) args = parser.parse_args() parallel_sampler.initialize(n_parallel=args.sampler_workers) if args.seed is not None: set_seed(args.seed) parallel_sampler.set_seed(args.seed) args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(','))) centralized = True if args.control == 'centralized' else False sensor_range = np.array(map(float, args.sensor_range.split(','))) assert sensor_range.shape == (args.n_pursuers, ) env = ContinuousHostageWorld(args.n_good, args.n_hostage, args.n_bad, args.n_coop_save, args.n_coop_avoid, n_sensors=args.n_sensors, sensor_range=args.sensor_range, save_reward=args.save_reward, hit_reward=args.hit_reward, encounter_reward=args.encounter_reward, bomb_reward=args.bomb_reward) env = RLLabEnv(StandardizedEnv(env), mode=args.control) if args.buffer_size > 1: env = ObservationBuffer(env, args.buffer_size) if args.recurrent: policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes) else: policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=args.hidden_sizes) if args.baseline_type == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) else: baseline = ZeroBaseline(obsfeat_space) # logger default_log_dir = config.LOG_DIR if args.log_dir is None: log_dir = osp.join(default_log_dir, args.exp_name) else: log_dir = args.log_dir tabular_log_file = osp.join(log_dir, args.tabular_log_file) text_log_file = osp.join(log_dir, args.text_log_file) params_log_file = osp.join(log_dir, args.params_log_file) logger.log_parameters_lite(params_log_file, args) logger.add_text_output(text_log_file) logger.add_tabular_output(tabular_log_file) prev_snapshot_dir = logger.get_snapshot_dir() prev_mode = logger.get_snapshot_mode() logger.set_snapshot_dir(log_dir) logger.set_snapshot_mode(args.snapshot_mode) logger.set_log_tabular_only(args.log_tabular_only) logger.push_prefix("[%s] " % args.exp_name) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=args.n_timesteps, max_path_length=args.max_traj_len, n_itr=args.n_iter, discount=args.discount, step_size=args.max_kl, mode=args.control, ) algo.train()
from madrl_environments import StandardizedEnv
from madrl_environments.pursuit import MAWaterWorld
from rllabwrapper import RLLabEnv
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy

env = StandardizedEnv(MAWaterWorld(3, 10, 2, 5))
env = RLLabEnv(env)

policy = GaussianGRUPolicy(env_spec=env.spec, hidden_sizes=(32,))
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=8000,
    max_path_length=200,
    n_itr=500,
    discount=0.99,
    step_size=0.01,
    mode='decentralized',
)
algo.train()
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.envs.gym_env import GymEnv
import rllab.misc.logger as logger
import os.path as osp

env = normalize(GymEnv('BipedalWalker-v2'))

# policy = GaussianMLPPolicy(
#     env_spec=env.spec,
#     # Two hidden layers, each with 64 hidden units.
#     hidden_sizes=(64, 64)
# )

policy = GaussianGRUPolicy(
    env_spec=env.spec,
    # A single GRU hidden layer with 32 units.
    hidden_sizes=(32, ))

baseline = LinearFeatureBaseline(env_spec=env.spec)

# logger
LOG_DIR = 'walker_gru_test'
tabular_log_file = osp.join(LOG_DIR, 'progress.csv')
text_log_file = osp.join(LOG_DIR, 'debug.log')
params_log_file = osp.join(LOG_DIR, 'params.json')

logger.add_text_output(text_log_file)
logger.add_tabular_output(tabular_log_file)
logger.set_snapshot_dir(LOG_DIR)
logger.set_snapshot_mode('last')
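The snippet stops after configuring the logger. A minimal sketch of how training could be started from this point with rllab's TRPO; the batch size, iteration count, and other hyperparameters are illustrative assumptions, not values from the original script:

from rllab.algos.trpo import TRPO

# Hypothetical training setup; all hyperparameters below are placeholders.
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,  # horizon forwarded from the wrapped gym env
    n_itr=500,
    discount=0.99,
    step_size=0.01,
)
algo.train()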
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam
import pdb

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a recurrent GRU policy (single recurrent hidden layer, default size)
policy = GaussianGRUPolicy(env.spec)
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph
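The snippet ends right before the graph is built. Below is a minimal sketch of how a vanilla policy-gradient (REINFORCE) surrogate loss could be assembled for this recurrent policy, assuming every sampled path is padded to the common length T; the variable names and the omission of a validity mask are simplifications, not part of the original script:

# Symbolic inputs: recurrent policies consume (batch, time, dim)-shaped tensors.
observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=2)          # shape (N, T, obs_dim)
actions_var = env.action_space.new_tensor_variable(
    'actions', extra_dims=2)               # shape (N, T, action_dim)
returns_var = TT.matrix('returns')         # discounted returns, shape (N, T)

# GaussianGRUPolicy may carry extra state inputs (e.g. the previous action when
# state_include_action=True); declare one 3-D tensor per state-info key.
state_info_vars = {k: TT.tensor3(k) for k in policy.state_info_keys}

dist = policy.distribution
dist_info_vars = policy.dist_info_sym(observations_var, state_info_vars)

# Per-timestep log-likelihoods of shape (N, T); the REINFORCE surrogate is the
# negative return-weighted mean. A mask for paths shorter than T is omitted
# here for brevity.
logli = dist.log_likelihood_sym(actions_var, dist_info_vars)
surr = -TT.mean(logli * returns_var)

params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)

f_train = theano.function(
    inputs=[observations_var, actions_var, returns_var]
           + [state_info_vars[k] for k in policy.state_info_keys],
    outputs=surr,
    updates=adam(grads, params, learning_rate=learning_rate),
    allow_input_downcast=True,
)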