def run_task(*_):
    # CSV log files (created/truncated up front, closed after training).
    f = open('/home/qingkai/verina.csv', 'w+')
    ff = open('/home/qingkai/cpo_dual.csv', 'w+')

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Ant gather task: collect apples (reward 10) while avoiding bombs (cost 1).
    env = AntGatherEnv(apple_reward=10, bomb_cost=1, n_apples=2,
                       activity_range=6)

    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 32))

    # Reward baseline: MLP regressor fit with conjugate-gradient steps.
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor)
        }
    )

    # Separate baseline fit against the safety (cost) returns.
    safety_baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args={
            'hidden_sizes': (64, 32),
            'hidden_nonlinearity': NL.tanh,
            'learn_std': False,
            'step_size': trpo_stepsize,
            'optimizer': ConjugateGradientOptimizer(
                subsample_factor=trpo_subsample_factor)
        },
        target_key='safety_returns',
    )

    # Constrain the expected safety cost per episode to at most 0.2.
    safety_constraint = GatherSafetyConstraint(max_value=0.2,
                                               baseline=safety_baseline)

    algo = CPO(
        env=env,
        policy=policy,
        baseline=baseline,
        safety_constraint=safety_constraint,
        safety_gae_lambda=0.5,
        batch_size=100000,
        max_path_length=500,
        n_itr=2000,
        gae_lambda=0.95,
        discount=0.995,
        step_size=trpo_stepsize,
        optimizer_args={'subsample_factor': trpo_subsample_factor},
        # plot=True,
    )
    algo.train()

    f.close()
    ff.close()
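
# A minimal launch sketch, not part of the original script: rllab-style tasks
# like run_task above are usually started via rllab's run_experiment_lite.
# The n_parallel/seed/exp_prefix values here are illustrative assumptions.
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=4,                 # sampler worker count (assumed)
    snapshot_mode='last',         # keep only the latest iteration's snapshot
    seed=1,                       # illustrative seed
    exp_prefix='cpo_ant_gather',  # hypothetical experiment name
)
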
def run_task(vv, log_dir=None, exp_name=None):
    # `policy` and `baseline` are module-level globals; they are non-None
    # when pre-trained networks have been loaded.
    global policy
    global baseline

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Validate variant parameters
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment
    if not vv['use_ros']:
        env = CircleEnv(target_velocity=vv['target_velocity'],
                        radius=vv['radius'],
                        dt=vv['dt'],
                        model_type=vv['model_type'],
                        robot_type=vv['robot_type'])
    else:
        from aa_simulation.envs.circle.circle_env_ros import CircleEnvROS
        env = CircleEnvROS(target_velocity=vv['target_velocity'],
                           radius=vv['radius'],
                           dt=vv['dt'],
                           model_type=vv['model_type'],
                           robot_type=vv['robot_type'])

    # Save variant information for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Set the variance for each action component separately for exploration.
    # Note: We set the variance manually because we are not scaling our
    # action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]

    # Build policy and baseline networks.
    # Note: The mean of the policy network is set to analytically computed
    # values for faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        wheelbase = 0.257
        target_velocity = vv['target_velocity']
        target_steering = np.arctan(wheelbase / vv['radius'])    # CCW
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In the mean network, allow the output b values to dominate the
        # final output value by constraining the magnitude of the output W
        # matrix. This allows faster learning. These numbers are arbitrarily
        # chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)

        mean_network = MLP(input_shape=(env.spec.observation_space.flat_dim,),
                           output_dim=env.spec.action_space.flat_dim,
                           hidden_sizes=hidden_sizes,
                           hidden_nonlinearity=LN.tanh,
                           output_nonlinearity=None,
                           output_W_init=LI.GlorotUniform(gain=W_gain),
                           output_b_init=output_mean)
        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=hidden_sizes,
                                   init_std=init_std,
                                   mean_network=mean_network)
        baseline = LinearFeatureBaseline(env_spec=env.spec,
                                         target_key='returns')

    # Reset variance to re-enable exploration when using pre-trained networks
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True)
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output(
            [policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(inputs=[obs_var],
                                              outputs=[mean_var, log_std_var])

    # Safety baseline and constraint for the circle-tracking task.
    safety_baseline = LinearFeatureBaseline(env_spec=env.spec,
                                            target_key='safety_returns')
    safety_constraint = CircleSafetyConstraint(max_value=1.0,
                                               eps=vv['eps'],
                                               baseline=safety_baseline)

    if vv['algo'] == 'TRPO':
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=600,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
        )
    else:
        algo = CPO(
            env=env,
            policy=policy,
            baseline=baseline,
            safety_constraint=safety_constraint,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=600,
            discount=0.99,
            step_size=trpo_stepsize,
            gae_lambda=0.95,
            safety_gae_lambda=1,
            optimizer_args={'subsample_factor': trpo_subsample_factor},
            plot=False,
        )
    algo.train()
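
# A variant-launch sketch, assumed rather than taken from the original file:
# the vv dict consumed by run_task carries the keys validated above. One
# plausible way to enumerate and launch variants with rllab's instrument
# utilities; every value below is illustrative, not from the source, and the
# actual launcher may differ since run_task also accepts log_dir/exp_name.
from rllab.misc.instrument import VariantGenerator, run_experiment_lite

vg = VariantGenerator()
vg.add('algo', ['CPO', 'TRPO'])
vg.add('use_ros', [False])
vg.add('model_type', ['BrushTireModel'])
vg.add('robot_type', ['RCCar'])
vg.add('target_velocity', [0.7])   # m/s, illustrative
vg.add('radius', [1.0])            # m, illustrative
vg.add('dt', [0.1])                # s, illustrative
vg.add('eps', [0.05])              # constraint slack, illustrative

for vv in vg.variants():
    run_experiment_lite(
        run_task,
        variant=vv,
        n_parallel=1,
        snapshot_mode='last',
        seed=1,
        exp_prefix='circle_cpo',   # hypothetical prefix
    )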