def __init__(self,
             inner_algo,
             env,
             policy,
             meta_optimizer,
             meta_batch_size=40,
             inner_lr=0.1,
             outer_lr=1e-3,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler

    self.max_path_length = inner_algo.max_path_length
    self._meta_evaluator = meta_evaluator
    self._policy = policy
    self._env = env
    self._value_function = copy.deepcopy(inner_algo._value_function)
    self._initial_vf_state = self._value_function.state_dict()
    self._num_grad_updates = num_grad_updates
    self._meta_batch_size = meta_batch_size
    self._inner_algo = inner_algo
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          module=policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
    self._evaluate_every_n_epochs = evaluate_every_n_epochs
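# The inner optimizer above (DifferentiableSGD) exists so that the adaptation
# step stays inside the autograd graph and the outer (meta) gradient can flow
# through it. A minimal sketch of that idea in plain PyTorch follows; the
# variable names and the quadratic losses are illustrative assumptions, not
# garage's actual API or objective.
import torch

torch.manual_seed(0)
w = torch.randn(3, requires_grad=True)        # meta-parameter
x, y = torch.randn(5, 3), torch.randn(5)      # one task's support data

inner_lr = 0.1
support_loss = ((x @ w - y) ** 2).mean()
# create_graph=True keeps the inner SGD step differentiable,
# so the outer update can backpropagate through it.
(grad,) = torch.autograd.grad(support_loss, w, create_graph=True)
w_adapted = w - inner_lr * grad               # one differentiable inner step

x_q, y_q = torch.randn(5, 3), torch.randn(5)  # the task's query data
query_loss = ((x_q @ w_adapted - y_q) ** 2).mean()
query_loss.backward()                         # meta-gradient lands in w.grad
print(w.grad)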
def __init__(self,
             optimizer=None,
             learning_rate=None,
             max_epochs=1000,
             tolerance=1e-6,
             batch_size=32,
             callback=None,
             verbose=False,
             name='FirstOrderOptimizer'):
    self._opt_fun = None
    self._target = None
    self._callback = callback
    if optimizer is None:
        optimizer = tf.compat.v1.train.AdamOptimizer
    learning_rate = learning_rate or dict(learning_rate=_Default(1e-3))
    if not isinstance(learning_rate, dict):
        learning_rate = dict(learning_rate=learning_rate)
    self._tf_optimizer = optimizer
    self._learning_rate = learning_rate
    self._max_epochs = max_epochs
    self._tolerance = tolerance
    self._batch_size = batch_size
    self._verbose = verbose
    self._input_vars = None
    self._train_op = None
    self._name = name
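# The constructor above only stores an optimizer class and a learning-rate
# dict; the train op is built later. A sketch of how such a stored pair can
# be turned into a TF1 train op follows. This is an assumed usage pattern for
# illustration, not garage's update_opt()/optimize() implementation, and the
# plain float stands in for the _Default wrapper.
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

optimizer_cls = tf.compat.v1.train.AdamOptimizer
learning_rate = dict(learning_rate=1e-3)

w = tf.compat.v1.get_variable(
    'w', shape=[1], initializer=tf.compat.v1.zeros_initializer())
loss = tf.reduce_mean((w - 3.0) ** 2)

# Instantiate the stored class with the stored kwargs and minimize the loss.
train_op = optimizer_cls(**learning_rate).minimize(loss)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op)
    print(sess.run(w))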
def __init__(self,
             env_spec,
             policy,
             baseline,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             fixed_horizon=False,
             epsilon=0.5,
             l2_reg_dual=0.,
             l2_reg_loss=0.,
             optimizer=LbfgsOptimizer,
             optimizer_args=None,
             dual_optimizer=scipy.optimize.fmin_l_bfgs_b,
             dual_optimizer_args=None,
             name='REPS'):
    optimizer_args = optimizer_args or dict(max_opt_itr=_Default(50))
    dual_optimizer_args = dual_optimizer_args or dict(maxiter=50)

    self.policy = policy
    self.max_path_length = max_path_length

    self._env_spec = env_spec
    self._baseline = baseline
    self._discount = discount
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._fixed_horizon = fixed_horizon
    self._flatten_input = True
    self._name = name
    self._name_scope = tf.name_scope(self._name)
    # Frozen copy of the policy, used when measuring the policy KL divergence
    # after an update.
    self._old_policy = policy.clone('old_policy')

    self._feat_diff = None
    self._param_eta = None
    self._param_v = None
    self._f_dual = None
    self._f_dual_grad = None
    self._f_policy_kl = None

    self._optimizer = make_optimizer(optimizer, **optimizer_args)
    self._dual_optimizer = dual_optimizer
    self._dual_optimizer_args = dual_optimizer_args
    self._epsilon = float(epsilon)
    self._l2_reg_dual = float(l2_reg_dual)
    self._l2_reg_loss = float(l2_reg_loss)

    self._episode_reward_mean = collections.deque(maxlen=100)

    if policy.vectorized:
        self.sampler_cls = OnPolicyVectorizedSampler
    else:
        self.sampler_cls = BatchSampler

    self.init_opt()
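# The dual_optimizer above defaults to scipy.optimize.fmin_l_bfgs_b with
# maxiter=50. A short sketch of how a dual objective and its gradient are
# handed to that routine follows; the quadratic objective is a stand-in for
# illustration only, not the actual REPS dual.
import numpy as np
import scipy.optimize

def dual(x):
    # Placeholder dual objective; must return a scalar.
    return float(np.sum((x - 1.0) ** 2))

def dual_grad(x):
    # Gradient of the placeholder objective, same shape as x.
    return 2.0 * (x - 1.0)

x0 = np.zeros(3)
x_opt, f_opt, info = scipy.optimize.fmin_l_bfgs_b(
    dual, x0, fprime=dual_grad, maxiter=50)
print(x_opt, f_opt)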
def __init__(self,
             env,
             policy,
             value_function,
             inner_lr=_Default(1e-2),
             outer_lr=1e-3,
             max_kl_step=0.01,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=40,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    vf_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), value_function)

    inner_algo = VPG(env.spec,
                     policy,
                     value_function,
                     policy_optimizer=policy_optimizer,
                     vf_optimizer=vf_optimizer,
                     max_path_length=max_path_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    meta_optimizer = (ConjugateGradientOptimizer,
                      dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     meta_optimizer=meta_optimizer,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
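# The (optimizer_class, kwargs) tuples above are only specifications; they are
# instantiated later. A minimal sketch of how such a pair can be turned into a
# working optimizer follows. The unpacking shown here is an assumption about
# what a helper like make_optimizer does, not its actual implementation.
import torch

model = torch.nn.Linear(4, 2)

optimizer_spec = (torch.optim.Adam, dict(lr=1e-2))
opt_cls, opt_kwargs = optimizer_spec
optimizer = opt_cls(model.parameters(), **opt_kwargs)

# One illustrative update step with a throwaway loss.
loss = model(torch.randn(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()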
def __init__(self,
             env_spec,
             policy,
             qf,
             replay_buffer,
             exploration_policy=None,
             steps_per_epoch=20,
             min_buffer_size=int(1e4),
             buffer_batch_size=64,
             rollout_batch_size=1,
             n_train_steps=50,
             max_path_length=None,
             qf_lr=_Default(0.001),
             qf_optimizer=tf.compat.v1.train.AdamOptimizer,
             discount=1.0,
             target_network_update_freq=5,
             grad_norm_clipping=None,
             double_q=False,
             reward_scale=1.,
             smooth_return=True,
             name='DQN'):
    self._qf_optimizer = qf_optimizer
    self._qf_lr = qf_lr
    self._name = name
    self._target_network_update_freq = target_network_update_freq
    self._grad_norm_clipping = grad_norm_clipping
    self._double_q = double_q

    # clone a target q-function
    self._target_qf = qf.clone('target_qf')

    super(DQN, self).__init__(env_spec=env_spec,
                              policy=policy,
                              qf=qf,
                              exploration_policy=exploration_policy,
                              min_buffer_size=min_buffer_size,
                              n_train_steps=n_train_steps,
                              steps_per_epoch=steps_per_epoch,
                              buffer_batch_size=buffer_batch_size,
                              rollout_batch_size=rollout_batch_size,
                              replay_buffer=replay_buffer,
                              max_path_length=max_path_length,
                              discount=discount,
                              reward_scale=reward_scale,
                              smooth_return=smooth_return)
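# Sketch of the TD target that the double_q flag above switches between,
# written in NumPy for clarity; the actual target is built as part of the
# TensorFlow graph elsewhere, so this function is purely illustrative.
import numpy as np

def td_target(reward, done, q_next_online, q_next_target, discount, double_q):
    """q_next_* are per-action value rows for the next observation."""
    if double_q:
        # Double DQN: online network selects the action,
        # target network evaluates it.
        best_action = np.argmax(q_next_online)
        next_value = q_next_target[best_action]
    else:
        # Vanilla DQN: target network both selects and evaluates.
        next_value = np.max(q_next_target)
    return reward + (1.0 - done) * discount * next_value

print(td_target(1.0, 0.0,
                q_next_online=np.array([0.2, 0.9, 0.1]),
                q_next_target=np.array([0.5, 0.3, 0.4]),
                discount=0.99, double_q=True))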
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_path_length=None,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        rollout_batch_size=1,
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=tf.compat.v1.train.AdamOptimizer,
        qf_optimizer=tf.compat.v1.train.AdamOptimizer,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True,
        name='DDPG'):
    action_bound = env_spec.action_space.high
    self._max_action = action_bound if max_action is None else max_action
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._name = name
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    # Clone target networks for the soft (Polyak-averaged) updates.
    self._target_policy = policy.clone('target_policy')
    self._target_qf = qf.clone('target_qf')

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer
    self._policy_lr = policy_lr
    self._qf_lr = qf_lr

    super(DDPG, self).__init__(env_spec=env_spec,
                               policy=policy,
                               qf=qf,
                               n_train_steps=n_train_steps,
                               steps_per_epoch=steps_per_epoch,
                               max_path_length=max_path_length,
                               max_eval_path_length=max_eval_path_length,
                               buffer_batch_size=buffer_batch_size,
                               min_buffer_size=min_buffer_size,
                               rollout_batch_size=rollout_batch_size,
                               exploration_policy=exploration_policy,
                               replay_buffer=replay_buffer,
                               use_target=True,
                               discount=discount,
                               reward_scale=reward_scale,
                               smooth_return=smooth_return)
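# Sketch of the soft target update that target_update_tau controls, using
# tf.compat.v1 assign ops. The variable names are illustrative and garage's
# own target-update ops are built elsewhere; this only shows the rule
# target <- tau * source + (1 - tau) * target.
import tensorflow as tf

tf.compat.v1.disable_eager_execution()
tau = 0.01

source = tf.compat.v1.get_variable(
    'qf_w', initializer=tf.constant([1.0, 2.0]))
target = tf.compat.v1.get_variable(
    'target_qf_w', initializer=tf.constant([0.0, 0.0]))

update_op = tf.compat.v1.assign(target, tau * source + (1.0 - tau) * target)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(update_op)
    print(sess.run(target))  # approximately [0.01, 0.02]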