def __init__(self, inner_algo, env, policy, value_function, meta_optimizer,
             meta_batch_size=40, inner_lr=0.1, outer_lr=1e-3,
             num_grad_updates=1, meta_evaluator=None,
             evaluate_every_n_epochs=1):
    # Pick the sampler that matches the policy's vectorization support.
    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler
    self.max_path_length = inner_algo.max_path_length

    self._meta_evaluator = meta_evaluator
    self._policy = policy
    self._env = env
    self._value_function = value_function
    self._num_grad_updates = num_grad_updates
    self._meta_batch_size = meta_batch_size
    self._inner_algo = inner_algo
    # Inner-loop adaptation uses differentiable SGD so the meta-gradient can
    # flow back through the adapted parameters to the meta-optimizer.
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
    self._evaluate_every_n_epochs = evaluate_every_n_epochs
def __init__(self, inner_algo, env, policy, baseline, meta_optimizer,
             meta_batch_size=40, inner_lr=0.1, outer_lr=1e-3,
             num_grad_updates=1):
    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler
    self.max_path_length = inner_algo.max_path_length

    self._policy = policy
    self._env = env
    self._baseline = baseline
    self._num_grad_updates = num_grad_updates
    self._meta_batch_size = meta_batch_size
    self._inner_algo = inner_algo
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
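# --- Illustrative sketch (not library code) --------------------------------
# The two constructors above wire a DifferentiableSGD inner optimizer into the
# meta-learner. This minimal, self-contained PyTorch example shows why the
# inner update has to stay differentiable: the adapted parameters must remain
# part of the autograd graph so an outer (meta) loss can backpropagate through
# the adaptation step. All names below are local to this sketch.
import torch

theta = torch.nn.Parameter(torch.tensor([1.0, 2.0]))
inner_lr = 0.1

# Inner step: one gradient step, keeping the graph alive with create_graph.
inner_loss = (theta ** 2).sum()
grad = torch.autograd.grad(inner_loss, theta, create_graph=True)[0]
theta_adapted = theta - inner_lr * grad  # still a function of theta

# Outer step: evaluate a post-adaptation loss and differentiate it with
# respect to the original parameters -- the meta-gradient MAML-style methods need.
outer_loss = ((theta_adapted - 3.0) ** 2).sum()
outer_loss.backward()
print(theta.grad)  # gradient that flowed through the inner update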
def __init__(self, env_spec, policy, baseline, optimizer=torch.optim.Adam,
             policy_lr=3e-4, max_path_length=500, num_train_per_epoch=1,
             discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False,
             policy_ent_coeff=0.0, use_softplus_entropy=False,
             stop_entropy_gradient=False, entropy_method='no_entropy',
             minibatch_size=None, max_optimization_epochs=1, **kwargs):
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._policy_ent_coeff = policy_ent_coeff
    self._use_softplus_entropy = use_softplus_entropy
    self._stop_entropy_gradient = stop_entropy_gradient
    self._entropy_method = entropy_method
    self._minibatch_size = minibatch_size
    self._max_optimization_epochs = max_optimization_epochs
    self._eps = 1e-8

    # Entropy handling flags; the combination is validated below.
    self._maximum_entropy = (entropy_method == 'max')
    self._entropy_regularzied = (entropy_method == 'regularized')
    self._check_entropy_configuration(entropy_method, center_adv,
                                      stop_entropy_gradient, policy_ent_coeff)
    self._episode_reward_mean = collections.deque(maxlen=100)

    self._optimizer = make_optimizer(optimizer,
                                     policy,
                                     lr=_Default(policy_lr),
                                     eps=_Default(1e-5))

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     baseline=baseline,
                     discount=discount,
                     max_path_length=max_path_length,
                     n_samples=num_train_per_epoch,
                     **kwargs)

    # Frozen snapshot of the policy from the previous update, used as the
    # old distribution when computing likelihood ratios.
    self._old_policy = copy.deepcopy(self.policy)

    ## ADD
    self.mode = ''
    self.name = ''
    self.batch_size = 0
    self.initialized = False
    self.image_trans = None
    self.initialize()
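# --- Illustrative usage sketch (assumptions flagged) -----------------------
# A rough example of constructing the algorithm defined by the __init__ above.
# The class name `VPG`, the garage-style import paths, and the environment id
# are assumptions (module paths differ across garage versions); only the
# keyword arguments come directly from the signature above.
import gym
import torch
from garage.envs import GarageEnv
from garage.np.baselines import LinearFeatureBaseline
from garage.torch.policies import GaussianMLPPolicy

env = GarageEnv(gym.make('Pendulum-v0'))
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(env_spec=env.spec,
           policy=policy,
           baseline=baseline,
           optimizer=torch.optim.Adam,
           policy_lr=3e-4,
           max_path_length=500,
           discount=0.99,
           gae_lambda=0.97,
           center_adv=True)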
def __init__(self, env_spec, policy, baseline, optimizer=torch.optim.Adam,
             policy_lr=_Default(3e-4), max_path_length=500,
             lr_clip_range=2e-1, num_train_per_epoch=1, discount=0.99,
             gae_lambda=0.97, center_adv=True, positive_adv=False,
             policy_ent_coeff=0.0, use_softplus_entropy=False,
             stop_entropy_gradient=False, entropy_method='no_entropy'):
    super().__init__(env_spec=env_spec,
                     policy=policy,
                     baseline=baseline,
                     optimizer=optimizer,
                     policy_lr=policy_lr,
                     max_path_length=max_path_length,
                     num_train_per_epoch=num_train_per_epoch,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)
    # Limit on the likelihood ratio between new and old policies
    # (the epsilon of the PPO-style clipped surrogate objective).
    self._lr_clip_range = lr_clip_range
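# --- Illustrative sketch (not library code) --------------------------------
# `lr_clip_range` above plays the role of epsilon in the standard PPO clipped
# surrogate objective. A minimal, self-contained expression of that objective
# (names local to this sketch):
import torch

def clipped_surrogate(ratio, advantage, clip_range=0.2):
    """min(r * A, clip(r, 1 - eps, 1 + eps) * A), averaged over samples.

    The objective is maximized; a training loss would be its negative.
    """
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantage
    return torch.min(unclipped, clipped).mean()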
def __init__(self, env, policy, value_function, inner_lr=_Default(1e-2),
             outer_lr=1e-3, max_kl_step=0.01, max_path_length=500,
             discount=0.99, gae_lambda=1, center_adv=True, positive_adv=False,
             policy_ent_coeff=0.0, use_softplus_entropy=False,
             stop_entropy_gradient=False, entropy_method='no_entropy',
             meta_batch_size=40, num_grad_updates=1, meta_evaluator=None,
             evaluate_every_n_epochs=1):
    policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    vf_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), value_function)

    # Inner (adaptation) algorithm: vanilla policy gradient with a learned
    # value function.
    inner_algo = VPG(env.spec,
                     policy,
                     value_function,
                     policy_optimizer=policy_optimizer,
                     vf_optimizer=vf_optimizer,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    # Outer (meta) step: conjugate-gradient trust-region update constrained
    # by max_kl_step.
    meta_optimizer = (ConjugateGradientOptimizer,
                      dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     meta_optimizer=meta_optimizer,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
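# --- Illustrative usage sketch (assumptions flagged) -----------------------
# A rough example of constructing the TRPO-flavored meta-learner above. The
# class name `MAMLTRPO`, the garage-style imports, and `make_meta_task_env()`
# are assumptions (the latter stands in for whatever multi-task environment
# the project uses); the keyword arguments mirror the signature above.
from garage.torch.policies import GaussianMLPPolicy
from garage.torch.value_functions import GaussianMLPValueFunction

env = make_meta_task_env()  # hypothetical: a task-distribution env with .spec
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                          hidden_sizes=(32, 32))

algo = MAMLTRPO(env=env,
                policy=policy,
                value_function=value_function,
                inner_lr=1e-2,
                outer_lr=1e-3,
                max_kl_step=0.01,
                meta_batch_size=40,
                num_grad_updates=1)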
def __init__(self, env, policy, baseline, inner_lr=_Default(1e-1),
             outer_lr=1e-3, lr_clip_range=5e-1, max_path_length=100,
             discount=0.99, gae_lambda=1.0, center_adv=True,
             positive_adv=False, policy_ent_coeff=0.0,
             use_softplus_entropy=False, stop_entropy_gradient=False,
             entropy_method='no_entropy', meta_batch_size=20,
             num_grad_updates=1, meta_evaluator=None,
             evaluate_every_n_epochs=1):
    # Inner (adaptation) algorithm: PPO with a clipped likelihood ratio and an
    # Adam inner step.
    inner_algo = PPO(env.spec,
                     policy,
                     baseline,
                     optimizer=torch.optim.Adam,
                     policy_lr=inner_lr,
                     lr_clip_range=lr_clip_range,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     baseline=baseline,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
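# --- Illustrative training-loop sketch (assumptions flagged) ---------------
# Once one of the meta-algorithms above is constructed, a garage-style
# experiment typically hands it to a runner. The decorator, runner API, epoch
# and batch sizes, the class name `MAMLPPO`, and `build_components()` are all
# assumptions based on common garage usage and may differ in this codebase.
from garage import wrap_experiment
from garage.experiment import LocalRunner
from garage.experiment.deterministic import set_seed


@wrap_experiment
def maml_ppo_experiment(ctxt=None, seed=1):
    set_seed(seed)
    env, policy, baseline = build_components()  # hypothetical helper
    algo = MAMLPPO(env=env, policy=policy, baseline=baseline,
                   meta_batch_size=20, num_grad_updates=1)

    runner = LocalRunner(ctxt)
    runner.setup(algo=algo, env=env)
    runner.train(n_epochs=300, batch_size=4000)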
def __init__(self, env_spec, policy, qf, replay_buffer, steps_per_epoch=20,
             n_train_steps=50, max_path_length=None, buffer_batch_size=64,
             min_buffer_size=int(1e4), rollout_batch_size=1,
             exploration_strategy=None, target_update_tau=0.01, discount=0.99,
             policy_weight_decay=0, qf_weight_decay=0,
             policy_optimizer=torch.optim.Adam, qf_optimizer=torch.optim.Adam,
             policy_lr=_Default(1e-4), qf_lr=_Default(1e-3),
             clip_pos_returns=False, clip_return=np.inf, max_action=None,
             reward_scale=1., smooth_return=True):
    action_bound = env_spec.action_space.high
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    # Default the action limit to the environment's action-space bound.
    self._max_action = action_bound if max_action is None else max_action
    self._evaluate = False

    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     qf=qf,
                     n_train_steps=n_train_steps,
                     steps_per_epoch=steps_per_epoch,
                     max_path_length=max_path_length,
                     buffer_batch_size=buffer_batch_size,
                     min_buffer_size=min_buffer_size,
                     rollout_batch_size=rollout_batch_size,
                     exploration_strategy=exploration_strategy,
                     replay_buffer=replay_buffer,
                     use_target=True,
                     discount=discount,
                     reward_scale=reward_scale,
                     smooth_return=smooth_return)

    # Target networks are slowly updated copies used to compute stable
    # TD targets.
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self.qf)
    self._policy_optimizer = make_optimizer(policy_optimizer, self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer, self.qf, lr=qf_lr)
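# --- Illustrative usage sketch (assumptions flagged) -----------------------
# A rough example of constructing the off-policy algorithm defined above. The
# class name `DDPG`, the network import paths, the environment id, and the
# replay-buffer / exploration helpers are assumptions (their APIs differ
# across garage versions); only the keyword arguments come from the signature.
import gym
from garage.envs import GarageEnv
from garage.torch.policies import DeterministicMLPPolicy
from garage.torch.q_functions import ContinuousMLPQFunction

env = GarageEnv(gym.make('Pendulum-v0'))
policy = DeterministicMLPPolicy(env.spec, hidden_sizes=(64, 64))
qf = ContinuousMLPQFunction(env.spec, hidden_sizes=(64, 64))
replay_buffer = make_replay_buffer(env.spec)          # hypothetical helper
exploration = make_exploration_strategy(env.spec)     # hypothetical helper

algo = DDPG(env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            exploration_strategy=exploration,
            steps_per_epoch=20,
            n_train_steps=50,
            target_update_tau=0.01,
            discount=0.99,
            policy_lr=1e-4,
            qf_lr=1e-3)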
def __init__(self, env_spec, policy, baseline, optimizer=torch.optim.Adam,
             baseline_optimizer=torch.optim.Adam,
             optimization_n_minibatches=1, optimization_mini_epochs=1,
             policy_lr=_Default(1e-2), lr_clip_range=2e-1,
             max_path_length=500, num_train_per_epoch=1, discount=0.99,
             gae_lambda=1, center_adv=True, positive_adv=False,
             policy_ent_coeff=0.0, use_softplus_entropy=False,
             stop_entropy_gradient=False, entropy_method='no_entropy',
             clip_grad_norm=None):
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._policy_ent_coeff = policy_ent_coeff
    self._use_softplus_entropy = use_softplus_entropy
    self._stop_entropy_gradient = stop_entropy_gradient
    self._entropy_method = entropy_method
    self._lr_clip_range = lr_clip_range
    self._eps = 1e-8

    # Entropy handling flags; the combination is validated below.
    self._maximum_entropy = (entropy_method == 'max')
    self._entropy_regularzied = (entropy_method == 'regularized')
    self._check_entropy_configuration(entropy_method, center_adv,
                                      stop_entropy_gradient, policy_ent_coeff)
    self._episode_reward_mean = collections.deque(maxlen=100)

    self._optimizer = make_optimizer(optimizer,
                                     policy,
                                     lr=policy_lr,
                                     eps=_Default(1e-5))
    # A linear-feature baseline is fit in closed form, so a gradient-based
    # baseline optimizer is only needed for a learned (e.g. neural-network)
    # baseline.
    if not isinstance(baseline, LinearFeatureBaseline):
        self._baseline_optimizer = make_optimizer(baseline_optimizer,
                                                  baseline,
                                                  lr=policy_lr,
                                                  eps=_Default(1e-5))
    self._optimization_n_minibatches = optimization_n_minibatches
    self._optimization_mini_epochs = optimization_mini_epochs
    self._clip_grad_norm = clip_grad_norm

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     baseline=baseline,
                     discount=discount,
                     max_path_length=max_path_length,
                     n_samples=num_train_per_epoch)

    # Frozen snapshot of the policy from the previous update, used as the
    # old distribution when computing likelihood ratios.
    self._old_policy = copy.deepcopy(self.policy)
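# --- Illustrative sketch (not library code) --------------------------------
# The minibatch, mini-epoch, and gradient-clipping options above correspond to
# a standard PPO-style optimization loop like the one sketched below. All
# names are local to this sketch: `loss_fn` is a hypothetical surrogate-loss
# callable and `samples` is assumed to be an indexable tensor of transitions;
# the real update lives elsewhere in the class.
import torch

def optimize_policy_sketch(loss_fn, policy, optimizer, samples,
                           n_minibatches=4, mini_epochs=10,
                           clip_grad_norm=0.5):
    idx = torch.randperm(len(samples))
    for _ in range(mini_epochs):
        for mb_idx in idx.chunk(n_minibatches):
            optimizer.zero_grad()
            loss = loss_fn(policy, samples[mb_idx])
            loss.backward()
            if clip_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(policy.parameters(),
                                               clip_grad_norm)
            optimizer.step()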