# Imports assumed for the garage API era these snippets target.
import torch

from garage.experiment import LocalRunner
from garage.experiment.deterministic import set_seed
from garage.tf.envs import TfEnv
from garage.torch.algos import VPG
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction


def vpg_pendulum(ctxt=None, seed=1):
    """Train VPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    env = TfEnv(env_name='InvertedDoublePendulum-v2')

    runner = LocalRunner(ctxt)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
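# Minimal launcher sketch for the snippet above, assuming a garage
# release that exports the wrap_experiment decorator; it creates the
# ExperimentContext and injects it as ctxt.
from garage import wrap_experiment

vpg_pendulum = wrap_experiment(vpg_pendulum)

if __name__ == '__main__':
    vpg_pendulum(seed=1)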
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters.

    """
    env = TfEnv(env_name='InvertedDoublePendulum-v2')
    runner = LocalRunner(snapshot_config)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               max_path_length=100,
               discount=0.99,
               center_adv=False)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
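# Launcher sketch for the older garage API that run_task targets,
# assuming a pre-2020 release that exports run_experiment; it builds the
# snapshot config and invokes run_task(snapshot_config, *args).
from garage.experiment import run_experiment

run_experiment(run_task, snapshot_mode='last', seed=1)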
def __init__(self,
             env,
             policy,
             value_function,
             sampler,
             task_sampler,
             inner_lr=_Default(1e-2),
             outer_lr=1e-3,
             max_kl_step=0.01,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=40,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """Wire a VPG inner loop into the MAML meta-algorithm, with a
    TRPO-style conjugate-gradient meta step."""
    # Per-task (inner-loop) optimizers for the policy and value function.
    policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    vf_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), value_function)

    inner_algo = VPG(env.spec,
                     policy,
                     value_function,
                     None,
                     policy_optimizer=policy_optimizer,
                     vf_optimizer=vf_optimizer,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    # KL-constrained (TRPO-style) meta optimizer, given as a
    # (class, kwargs) spec.
    meta_optimizer = (ConjugateGradientOptimizer,
                      dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     sampler=sampler,
                     task_sampler=task_sampler,
                     meta_optimizer=meta_optimizer,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
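# Both the inner-loop optimizers and the meta optimizer above are passed
# as (optimizer_class, kwargs) tuples rather than instances. A sketch of
# how such a spec can be resolved; make_optimizer_from_spec is
# illustrative, not garage's internal helper.
import torch


def make_optimizer_from_spec(spec, module):
    """Instantiate an (optimizer_class, kwargs) spec for a module."""
    optimizer_cls, kwargs = spec
    return optimizer_cls(module.parameters(), **kwargs)


# e.g. the same spec shape used for the meta optimizer above:
opt = make_optimizer_from_spec((torch.optim.Adam, dict(lr=1e-3)),
                               torch.nn.Linear(4, 2))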
def load_vpg(env_name='InvertedDoublePendulum-v2'):
    """Return an instance of the VPG algorithm."""
    env = GarageEnv(env_name=env_name)
    # VPG needs a stochastic policy to compute action log-likelihoods;
    # the original DeterministicMLPPolicy (paired with the discrete
    # CartPole-v0 default) would fail here, so a Gaussian policy on a
    # continuous-control task is used instead.
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32, 32))
    vfunc = GaussianMLPValueFunction(env_spec=env.spec)
    algo = VPG(env_spec=env.spec, policy=policy, value_function=vfunc)
    return algo
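# Usage sketch for load_vpg, assuming the same garage API as the
# snippets above. The environment is rebuilt here because load_vpg only
# returns the algorithm; per the docstring earlier, a snapshot_config of
# None makes LocalRunner create a snapshotter with default settings.
from garage.envs import GarageEnv
from garage.experiment import LocalRunner

algo = load_vpg('InvertedDoublePendulum-v2')
env = GarageEnv(env_name='InvertedDoublePendulum-v2')
runner = LocalRunner(snapshot_config=None)
runner.setup(algo, env)
runner.train(n_epochs=5, batch_size=4000)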
def __init__(self,
             env,
             policy,
             baseline,
             inner_lr=_Default(1e-2),
             outer_lr=1e-3,
             max_kl_step=0.01,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=40,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """Wire a VPG inner loop into the MAML meta-algorithm (older
    baseline-based garage API), with a conjugate-gradient meta step."""
    inner_algo = VPG(env.spec,
                     policy,
                     baseline,
                     optimizer=torch.optim.Adam,
                     policy_lr=inner_lr,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    meta_optimizer = (ConjugateGradientOptimizer,
                      dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     baseline=baseline,
                     meta_optimizer=meta_optimizer,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
def __init__(self,
             env,
             policy,
             value_function,
             inner_lr=_Default(1e-1),
             outer_lr=1e-3,
             max_path_length=100,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=20,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """Wire a VPG inner loop into the MAML meta-algorithm, with an Adam
    meta step (MAML-VPG)."""
    inner_algo = VPG(env.spec,
                     policy,
                     value_function,
                     optimizer=torch.optim.Adam,
                     policy_lr=inner_lr,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     value_function=value_function,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
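# The three constructors above differ mainly in garage API era and meta
# optimizer (conjugate gradient vs. Adam); all of them wire a VPG inner
# loop into a MAML outer loop. Below is a toy *first-order* sketch of
# that inner/outer structure. It is illustrative only: not garage code,
# and not the full second-order MAML update these classes perform.
import copy

import torch


def fomaml_step(loss_fn, model, tasks, inner_lr=1e-1, outer_lr=1e-3):
    """One first-order MAML step: adapt a copy of model per task with
    SGD, average the post-adaptation gradients, and apply them as the
    meta update. loss_fn(model, task) is assumed to return a scalar.
    """
    meta_grads = [torch.zeros_like(p) for p in model.parameters()]
    for task in tasks:
        adapted = copy.deepcopy(model)
        inner_opt = torch.optim.SGD(adapted.parameters(), lr=inner_lr)
        # Inner adaptation (num_grad_updates == 1 in the snippets above).
        inner_opt.zero_grad()
        loss_fn(adapted, task).backward()
        inner_opt.step()
        # Post-adaptation gradient, accumulated across the task batch.
        post_loss = loss_fn(adapted, task)
        grads = torch.autograd.grad(post_loss, list(adapted.parameters()))
        for g_sum, g in zip(meta_grads, grads):
            g_sum += g / len(tasks)
    # Outer (meta) update applied to the original parameters.
    with torch.no_grad():
        for p, g in zip(model.parameters(), meta_grads):
            p -= outer_lr * g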
def run_task(snapshot_config, *_):
    """Run the job."""
    env = TfEnv(env_name='InvertedDoublePendulum-v2')
    runner = LocalRunner(snapshot_config)

    policy = GaussianMLPPolicy(env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = VPG(env_spec=env.spec,
               policy=policy,
               optimizer=torch.optim.Adam,
               baseline=baseline,
               max_path_length=100,
               discount=0.99,
               center_adv=False,
               policy_lr=1e-2)

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=10000)
# SimpleNamespace is needed below; move_to, clip_grad_norms, and
# log_values come from the surrounding training codebase.
from types import SimpleNamespace


def train_batch(model, optimizer, baseline, epoch, batch_id, step, batch,
                tb_logger, opts):
    # x holds the states/nodes (the analogue of obs in vpg.py).
    x, bl_val = baseline.unwrap_batch(batch)
    x = move_to(x, opts.device)
    bl_val = move_to(bl_val, opts.device) if bl_val is not None else None

    # Evaluate the model to get costs and log-probabilities. The
    # log-likelihood comes from the policy distribution used to pick
    # actions/tours, and cost is a vector of tour times (these play the
    # role of rewards; compare _compute_policy_entropy in vpg.py).
    cost, log_likelihood = model(x)

    # Check tensor sizes.
    print('---Checking data sizes---')
    for key_x in x:
        print('Batch ID:', batch_id, '->', key_x, '->', x[key_x].shape)
    print('Batch ID:', batch_id, '-> Cost ->', cost.shape)

    # Synthetic construction of the episode batch garage's VPG expects.
    obs_garage = x['loc'].clone().detach().cpu()
    rewards_garage = cost.clone().detach().cpu().numpy()
    # padded_rewards_garage = pad_tensor(rewards_garage, len(rewards_garage), mode='last')
    padded_rewards_garage = rewards_garage.reshape(rewards_garage.shape[0], 1)
    lens = [(obs_garage.shape[1] - 1) for i in range(obs_garage.shape[0])]
    eps = SimpleNamespace(padded_observations=obs_garage,
                          padded_rewards=padded_rewards_garage,
                          lengths=lens,
                          observations=obs_garage,
                          rewards=rewards_garage,
                          actions=obs_garage)
    env_spec = SimpleNamespace(max_episode_length=20)

    vpg = VPG(env_spec=env_spec,
              policy=None,
              value_function=None,
              sampler=None,
              policy_optimizer=optimizer,
              vf_optimizer=optimizer)
    print('VPG run ' + str(vpg._train_once(1, eps)))

    # Evaluate the baseline and get the baseline loss, if any (critic only).
    bl_val, bl_loss = baseline.eval(x, cost) if bl_val is None else (bl_val, 0)
    # print('VPG run loss: ' + str(vpg._compute_advantage(cost, lens, bl_val)))

    # REINFORCE loss: advantage-weighted log-likelihood.
    reinforce_loss = ((cost - bl_val) * log_likelihood).mean()
    loss = reinforce_loss + bl_loss

    # Backward pass and optimization step.
    optimizer.zero_grad()
    loss.backward()
    # Clip gradient norms and get the (clipped) norms for logging.
    grad_norms = clip_grad_norms(optimizer.param_groups, opts.max_grad_norm)
    optimizer.step()

    # Logging.
    if step % int(opts.log_step) == 0:
        log_values(cost, grad_norms, epoch, batch_id, step, log_likelihood,
                   reinforce_loss, bl_loss, tb_logger, opts)
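# A minimal stand-in for clip_grad_norms compatible with the call above;
# the real helper in the training codebase may differ, e.g. by returning
# both pre- and post-clip norms.
import torch


def clip_grad_norms(param_groups, max_norm=float('inf')):
    """Clip each optimizer param group's gradient norm in place and
    return the (pre-clip) total norms for logging. Illustrative only."""
    return [
        torch.nn.utils.clip_grad_norm_(group['params'], max_norm)
        for group in param_groups
    ]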