def create_policy_rllab(policy, env, weights):
    # Infer network dimensions from the environment
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Mean network of the Gaussian policy
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=LI.Normal(),
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
    )

    # Set the weights
    if weights is not None:
        raise Exception('TODO load pickle file.')
    else:
        weights = WEIGHTS
    policy.set_param_values(weights)
    return policy
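# A minimal usage sketch for create_policy_rllab (illustrative, not from the
# original script: it assumes a continuous-action rllab environment and that
# the module-level WEIGHTS vector matches the 'linear' architecture's
# parameter count).
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize

env = normalize(CartpoleEnv())
linear_policy = create_policy_rllab('linear', env, weights=None)  # falls back to WEIGHTS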
def run_task(*_):
    env = normalize(
        GymEnv("DartHopper-v1", record_log=False, record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 128 and 64 units
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)

    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)

    algo = PPO_Clip_Sym(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        whole_paths=False,
        observation_permutation=np.array(
            [0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        action_permutation=np.array([0.0001, 1, 2]),
        sym_loss_weight=0.0,
    )
    algo.train()
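# How the signed permutations above are interpreted (a sketch, assuming the
# convention used by symmetry-augmented rllab algorithms such as PPO_Clip_Sym
# and TRPO_Symmetry below: the magnitude of each entry is the source index and
# its sign says whether the mirrored dimension is negated; index 0 is written
# as 0.0001 because 0 cannot carry a sign). The mirror() helper is
# illustrative, not part of the original code.
import numpy as np

def mirror(vec, perm):
    idx = np.abs(perm).astype(int)  # which source dimension to copy
    sign = np.sign(perm)            # +1 keeps the value, -1 negates it
    return sign * vec[idx]

obs_perm = np.array([0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mirrored_obs = mirror(np.arange(11, dtype=float), obs_perm)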
def run_multi_vpg_stein_no_critic(*_):
    env = normalize(env_name())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(25, 16),
        adaptive_std=False,
    )
    policy_list = [
        GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(25, 16),
            adaptive_std=False,
        ) for _ in range(num_of_agents)
    ]

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline_list = [
        LinearFeatureBaseline(env_spec=env.spec) for _ in range(num_of_agents)
    ]

    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))

    algo = VPG_multi_Stein(
        num_of_agents=num_of_agents,
        temp=temperature,
        env=env,
        policy=policy,
        policy_list=policy_list,
        baseline=baseline,
        baseline_list=baseline_list,
        batch_size=batch_size,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        learning_rate=learning_rate,
        with_critic=True,
    )
    algo.train()
def test_trpo_deterministic_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(1,))
    # Force the log-std to log(1e-8) so the policy is effectively deterministic
    policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=10,
        batch_size=1000,
        max_path_length=100,
        step_size=0.01,
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def test_trpo_relu_nan():
    env = DummyEnv()
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1,),
    )
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
        step_size=0.001,
    )
    algo.train()
    assert not np.isnan(np.sum(policy.get_param_values()))
def rllab_trpo_launcher(variant):
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.algos.trpo import TRPO
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from railrl.launchers.launcher_util import get_env_settings
    import lasagne.nonlinearities as NL

    env_settings = get_env_settings(**variant['env_params'])
    env = env_settings['env']
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        output_nonlinearity=NL.tanh,
    )
    baseline = LinearFeatureBaseline(env.spec)
    batch_size = 5000
    algorithm = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,  # was defined but unused in the original
        whole_paths=True,
        max_path_length=500,
        n_itr=1000,
        step_size=0.01,
        subsample_factor=1.0,
    )
    algorithm.train()
def run_task(*_):
    # Please note that different environments with different action spaces may
    # require different policies. For example, with a Discrete action space a
    # CategoricalMLPPolicy works, but a Box action space may need a
    # GaussianMLPPolicy (see the trpo_gym_pendulum.py example).
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))

    # policy = CategoricalMLPPolicy(
    #     env_spec=env.spec,
    #     hidden_sizes=(32, 32)
    # )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 64 units each
        hidden_sizes=(64, 64))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # max_path_length = env.horizon
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=300,
        n_itr=10000,
        discount=0.99,
        # step_size=0.02,
        truncate_local_is_ratio=0.2,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(Gomoku())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Three hidden layers with 100, 50, and 25 units
        hidden_sizes=(100, 50, 25),
        output_nonlinearity=NL.tanh)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=110,
        n_itr=500,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
        # epopt_epsilon=1.0,
        # epopt_after_iter=0,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_vpg_baseline_large_batch_size_no_critic(*_):
    env = normalize(env_name())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 50, 25),
        adaptive_std=False,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    print("Iteration Number: {:}".format(n_itr))
    print("Learning Rate : {:}".format(learning_rate))

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size * num_of_agents,
        max_path_length=500,
        n_itr=n_itr,
        discount=0.99,
        optimizer_args={'learning_rate': learning_rate},
        sampler_cls=BatchSampler_no_critic,
    )
    algo.train()
def run_task(*_):
    env_name = "BottleneckEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params,
                   net_params, initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=20000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=400,
        discount=0.995,
        # step_size=0.01,
    )
    algo.train()
def run_task(*_):
    env = normalize(TendonTwoSegmentSE3Env())

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # output_nonlinearity=NL.tanh,
        # std_hidden_nonlinearity=NL.rectify,
        # hidden_nonlinearity=NL.rectify,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10,
        max_path_length=np.inf,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        gae_lambda=1,
    )
    algo.train()
def run_task(vv, log_dir=None, exp_name=None):
    # Load environment
    radius = vv['radius']
    target_velocity = vv['target_velocity']
    env = normalize(CircleEnv(radius, target_velocity))

    # Save variant information for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Train policy using TRPO
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32)
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1000,
        max_path_length=env.horizon,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        plot=False,
    )
    algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = normalize(GymEnv("Pendulum-v0", record_video=False, force_reset=True))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5, symmetric=False)),
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
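# These run_task functions are normally launched through rllab's
# run_experiment_lite helper rather than called directly. A minimal sketch
# (the seed and snapshot settings are illustrative, not from the original):
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,          # number of parallel sampling workers
    snapshot_mode="last",  # keep only the last iteration's snapshot
    seed=1,
    # plot=True,           # pair with plot=True in the algorithm to visualize
)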
def run_task(_):
    env_name = "PlatooningEnv"
    register(
        id=env_name + '-v0',
        entry_point='platooning_env:{}'.format(env_name),
        max_episode_steps=HORIZON,
        kwargs={"env_params": ENV_PARAMS}
    )
    env = GymEnv(env_name, record_video=False)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16, 16),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=1000,
        # whole_paths=True,
        discount=0.999,
    )
    algo.train()
def run_task(*_):
    # env = normalize(HalfCheetahEnv())
    env = GymEnv(env_name="MountainCarContinuous-v0", force_reset=True)

    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    baseline = ZeroBaseline(env_spec=env.spec)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers
        hidden_sizes=(64, 64)
    )

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100,
        max_path_length=100,
        n_itr=10000,
        discount=0.99,
        optimizer_args=dict(
            learning_rate=0.01,
        )
    )
    algo.train()
def run_task(*_):
    # Please note that different environments with different action spaces may require different
    # policies. For example with a Box action space, a GaussianMLPPolicy works, but for a Discrete
    # action space may need to use a CategoricalMLPPolicy (see the trpo_gym_cartpole.py example)
    env = normalize(GymEnv("Reacher-v1"))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    env = normalize(Cassie2dEnv())

    if load_policy:
        filename = "123"
        data = joblib.load(filename)
        policy = data['policy']
        print("Loading Pretrained Policy ...............................")
    else:
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            # The neural network policy should have two hidden layers, each with 32 hidden units.
            hidden_sizes=(32, 32),
            init_std=1.0,
            # adaptive_std=True,
        )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=10000,
        max_path_length=1000,  # dt = (1/2000)*n, where n is Step(n)
        n_itr=400,
        discount=0.99,
        step_size=0.005,  # default was 0.01
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        plot=False,
    )
    algo.train()
def run_task(*_):
    f = open('/home/qingkai/verina.csv', "w+")
    ff = open('/home/qingkai/cpo_dual.csv', "w+")
    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    env = AntGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2,
                       activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor)
        }
    )

    safety_baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor)
        },
        target_key='safety_returns',
    )

    safety_constraint = GatherSafetyConstraint(max_value=0.2,
                                               baseline=safety_baseline)

    algo = CPO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        safety_gae_lambda=0.5,
        batch_size=100000,
        max_path_length=500,
        n_itr=2000,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )
    algo.train()
    f.close()
    ff.close()
def train(env, policy, policy_init, num_episodes, episode_cap, horizon,
          **alg_args):
    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=horizon * num_episodes,
        whole_paths=True,
        max_path_length=horizon,
        **alg_args
    )
    algo.train()
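# A hypothetical invocation of train() above; the environment name and
# hyperparameter values are illustrative (rllab_env_from_name must recognize
# the env string), and extra keyword arguments pass through to TRPO:
train(env='cartpole', policy='linear', policy_init='zeros',
      num_episodes=100, episode_cap=True, horizon=500, n_itr=100)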
def run_task(*_):
    sumo_params = SumoParams(sim_step=0.1, sumo_binary="sumo")

    vehicles = Vehicles()
    vehicles.add(veh_id="rl",
                 acceleration_controller=(RLController, {}),
                 routing_controller=(ContinuousRouter, {}),
                 speed_mode="no_collide",
                 num_vehicles=1)
    vehicles.add(veh_id="idm",
                 acceleration_controller=(IDMController, {"noise": 0.2}),
                 routing_controller=(ContinuousRouter, {}),
                 speed_mode="no_collide",
                 num_vehicles=13)

    additional_env_params = {"target_velocity": 20,
                             "max_accel": 3, "max_decel": 3}
    env_params = EnvParams(horizon=HORIZON,
                           additional_params=additional_env_params)

    additional_net_params = {"radius_ring": 30, "lanes": 1,
                             "speed_limit": 30, "resolution": 40}
    net_params = NetParams(no_internal_links=False,
                           additional_params=additional_net_params)

    initial_config = InitialConfig(spacing="uniform")

    print("XXX name", exp_tag)
    scenario = Figure8Scenario(exp_tag, Figure8Generator, vehicles,
                               net_params, initial_config=initial_config)

    env_name = "AccelEnv"
    pass_params = (env_name, sumo_params, vehicles, env_params,
                   net_params, initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=horizon,
        n_itr=500,
        # whole_paths=True,
        discount=0.999,
        # step_size=v["step_size"],
    )
    algo.train()
def test_baseline(baseline_cls):
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,))
    baseline = baseline_cls(env_spec=env.spec)
    algo = VPG(
        env=env,
        policy=policy,
        baseline=baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100
    )
    algo.train()
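# test_baseline is parameterized over the baseline class; a sketch of how it
# might be driven (LinearFeatureBaseline and ZeroBaseline both provide the
# env_spec constructor interface assumed here):
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.baselines.zero_baseline import ZeroBaseline

for cls in (LinearFeatureBaseline, ZeroBaseline):
    test_baseline(cls)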
def run_task(*_): env = normalize( GymEnv("DartWalker3d-v1", record_log=False, record_video=False)) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(128, 64), net_mode=0, ) #policy = joblib.load('data/local/experiment/walker3d_symmetry1_sd13_2alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_2d_hardvelenforce_contsupport/policy.pkl') # increase policy std a bit for exploration #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5) print('trainable parameter size: ', policy.get_param_values(trainable=True).shape) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) algo = TRPO_Symmetry( env=env, policy=policy, baseline=baseline, batch_size=60000, max_path_length=env.horizon, n_itr=500, discount=0.99, step_size=0.02, gae_lambda=0.97, observation_permutation=np.array([0.0001,-1, 2,-3,-4, -5,-6,7, 14,-15,-16, 17, 18,-19, 8,-9,-10, 11, 12,-13,\ 20,21,-22, 23,-24,-25, -26,-27,28, 35,-36,-37, 38, 39,-40, 29,-30,-31, 32, 33,-34, 42, 41]), #observation_permutation=np.array([0.0001, 1, 5,6,7, 2,3,4, 8,9,10, 14,15,16, 11,12,13]), #action_permutation=np.array([3,4,5, 0.00001,1,2]), action_permutation=np.array([-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]), sym_loss_weight=2.0, whole_paths=False, ) algo.train()
def run_task(*_): env = normalize( GymEnv("DartHumanWalker-v1", record_log=False, record_video=False)) policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(128, 64), net_mode=0, ) #policy = joblib.load('data/local/experiment/humanwalker_symmetry1_sd11_1alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_spd20002000/policy.pkl') # increase policy std a bit for exploration #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5) print('trainable parameter size: ', policy.get_param_values(trainable=True).shape) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) algo = TRPO_Symmetry( env=env, policy=policy, baseline=baseline, batch_size=50000, max_path_length=env.horizon, n_itr=1000, discount=0.99, step_size=0.02, gae_lambda=0.97, observation_permutation=np.array([0.0001,-1,2,-3,-4, -11,12,-13,14,15,16, -5,6,-7,8,9,10, -17,18, -19, -24,25,-26,27, -20,21,-22,23,\ 28,29,-30,31,-32,-33, -40,41,-42,43,44,45, -34,35,-36,37,38,39, -46,47, -48, -53,54,-55,56, -49,50,-51,52, 58,57]), action_permutation=np.array([-6,7,-8, 9, 10,11, -0.001,1,-2, 3, 4,5, -12,13, -14, -19,20,-21,22, -15,16,-17,18]), sym_loss_weight=1.0, action_reg_weight=0.0, whole_paths=False, ) algo.train()
def run_task(v):
    which_agent = v["which_agent"]
    env, _ = create_env(which_agent)

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    optimizer_params = dict(base_eps=1e-5)

    # How many TRPO iterations to run, per agent
    num_trpo_iters = 2500
    if which_agent == 1:
        num_trpo_iters = 2500
    if which_agent == 2:
        steps_per_rollout = 333
        num_trpo_iters = 200
    if which_agent == 4:
        num_trpo_iters = 2000
    if which_agent == 6:
        num_trpo_iters = 2000

    # Recreate the policy
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(v["depth_fc_layers"], v["depth_fc_layers"]),
        init_std=v["std_on_mlp_policy"])

    # Restore the mean-network weights and append the current log-std values
    all_params = np.concatenate(
        (v["policy_values"], policy._l_log_std.get_params()[0].get_value()))
    policy.set_param_values(all_params)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v["trpo_batchsize"],
        max_path_length=v["steps_per_rollout"],
        n_itr=num_trpo_iters,
        discount=0.995,
        optimizer=v["ConjugateGradientOptimizer"](
            hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
        step_size=0.05,
        plot_true=True)  # NOTE: rllab's TRPO expects `plot=`; this kwarg is silently swallowed

    # Train the policy
    algo.train()
def buildPolicy(polValTpl, env):
    # Directory to save results for this run - built from the policy architecture
    snapShotDir = 'snapShots' + '_'.join([str(x) for x in polValTpl])
    print('snapShotDir : ' + snapShotDir)
    if not os.path.exists(snapShotDir):
        os.makedirs(snapShotDir)
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=polValTpl)
    return policy, snapShotDir
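# Example use of buildPolicy (hypothetical: assumes an rllab `env` is already
# in scope; any tuple of hidden-layer widths accepted by GaussianMLPPolicy
# works as the architecture):
policy, snapshot_dir = buildPolicy((32, 32), env)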
def run_task(*_):
    env = normalize(PointEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=100,
        n_itr=500,
        discount=0.99,
        step_size=0.01
    )
    algo.train()
def test_issue_3():
    """
    As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std
    parameter was not functioning properly.
    """
    env = CartpoleEnv()
    policy = GaussianMLPPolicy(env_spec=env.spec, adaptive_std=True)
    baseline = ZeroBaseline(env_spec=env.spec)
    algo = TRPO(env=env, policy=policy, baseline=baseline,
                batch_size=100, n_itr=1)
    algo.train()
def run_task(*_): """Implement the ``run_task`` method needed to run experiments with rllab. Note that the flow-specific parameters are imported at the start of this script and unzipped and processed here. """ env_name = flow_params["env_name"] exp_tag = flow_params["exp_tag"] sumo_params = flow_params["sumo"] vehicles = flow_params["veh"] env_params = flow_params["env"] net_params = flow_params["net"] initial_config = flow_params.get("initial", InitialConfig()) traffic_lights = flow_params.get("tls", TrafficLights()) # import the scenario and generator classes module = __import__("flow.scenarios", fromlist=[flow_params["scenario"]]) scenario_class = getattr(module, flow_params["scenario"]) module = __import__("flow.scenarios", fromlist=[flow_params["generator"]]) generator_class = getattr(module, flow_params["generator"]) # create the scenario object scenario = scenario_class(name=exp_tag, generator_class=generator_class, vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=traffic_lights) pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25)) baseline = LinearFeatureBaseline(env_spec=env.spec) horizon = flow_params["env"].horizon algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=horizon * (N_ROLLOUTS - PARALLEL_ROLLOUTS + 1), max_path_length=horizon, n_itr=500, discount=0.999, step_size=0.01, ) algo.train(),
def run_task(v):
    print("#################################")
    print("### agents_number       : " + str(agents_number))
    print("### participation_rate  : " + str(participation_rate))
    print("### average_period      : " + str(average_period))
    print("### quantization_tuning : " + str(quantization_tuning))
    print("### discount            : " + str(discount))
    print("#################################")

    env = normalize(CartpoleEnv())

    policy = GaussianMLPPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = Server(
        participation_rate=participation_rate,
        agents_number=agents_number,
        average_period=average_period,
        env=env,
        policy=policy,
        baseline=baseline,
        difference_params=True,
        quantize=True,
        quantization_tuning=quantization_tuning,
        batch_size=400,
        max_path_length=100,
        n_itr=50,
        discount=discount,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def run_task(*_):
    tot_cars = 6
    auton_cars = 6

    sumo_params = SumoParams(time_step=0.1, rl_speed_mode="no_collide",
                             sumo_binary="sumo-gui")

    vehicles = Vehicles()
    vehicles.add_vehicles("rl", (RLController, {}), (StaticLaneChanger, {}),
                          (ContinuousRouter, {}), 0, auton_cars)

    env_params = EnvParams(additional_params={"target_velocity": 25,
                                              "num_steps": 1000})

    additional_net_params = {"length": 220, "lanes": 1,
                             "speed_limit": 30, "resolution": 40}
    net_params = NetParams(additional_params=additional_net_params)

    initial_config = InitialConfig()

    scenario = LoopScenario("rl-test", CircleGenerator, vehicles, net_params,
                            initial_config)

    env_name = "SimpleAccelerationEnvironment"
    pass_params = (env_name, sumo_params, vehicles, env_params,
                   net_params, initial_config, scenario)

    env = GymEnv(env_name, record_video=False, register_params=pass_params)
    horizon = env.horizon
    env = normalize(env)

    logging.info("Experiment Set Up complete")

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16,)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2000,
        max_path_length=horizon,
        # whole_paths=True,
        n_itr=2,  # 1000
        # discount=0.99,
        # step_size=0.01,
    )
    algo.train()
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)

# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph
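# The snippet above stops at the graph construction; a minimal sketch of the
# next step, mirroring the full version inside doit() below:

# Symbolic variables for observations, actions, and empirical advantages
observations_var = env.observation_space.new_tensor_variable(
    'observations', extra_dims=1)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
advantages_var = TT.vector('advantages')

# Surrogate loss: negated score-function objective (minimized by adam)
dist_info_vars = policy.dist_info_sym(observations_var)
dist = policy.distribution
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)
f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=None,
    updates=adam(grads, params, learning_rate=learning_rate),
    allow_input_downcast=True,
)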
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))

    # Initialize a baseline estimator depending on the requested mode
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')

    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this
    # example. However, doing it in a slightly more abstract way allows us to delegate to the
    # environment for handling the correct data type for the variable. For instance, for an
    # environment with discrete observations, we might want to use integer types if the
    # observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for
    # quantities related to the distribution of the actions. For a Gaussian policy, it contains the
    # mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many
    # utilities for computing distribution-related quantities, given the computed dist_info_vars.
    # Below we use dist.log_likelihood_sym to compute the symbolic log-likelihood. For this example,
    # the corresponding distribution is an instance of the class rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one returns a dictionary,
                # whose values contain sufficient statistics for the action distribution. It should
                # at least contain entries that would be returned by calling policy.dist_info(),
                # which is the non-symbolic analog of policy.dist_info_sym(). Storing these
                # statistics is useful, e.g., when forming importance sampling ratios. In our case
                # it is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the
                # environment. In our case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to revert it
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline
            # or z transformation. now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)

        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print(('Average Return:', avgr))
        results.append(avgr)

    return results
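# A hypothetical driver for doit(); mode strings combine a baseline choice
# ("linbaseline", "vanilla", "batchavg") with an optional "_ztrans" suffix,
# matching the branches inside doit() above:
if __name__ == "__main__":
    for mode in ("vanilla", "linbaseline", "linbaseline_ztrans", "batchavg"):
        returns = doit(mode)
        print(mode, "final average return:", returns[-1])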