Code Example #1
    def __init__(self,
                 inner_algo,
                 env,
                 policy,
                 meta_optimizer,
                 meta_batch_size=40,
                 inner_lr=0.1,
                 outer_lr=1e-3,
                 num_grad_updates=1,
                 meta_evaluator=None,
                 evaluate_every_n_epochs=1):
        if policy.vectorized:
            self.sampler_cls = OnPolicyVectorizedSampler
        else:
            self.sampler_cls = BatchSampler

        self.max_path_length = inner_algo.max_path_length

        self._meta_evaluator = meta_evaluator
        self._policy = policy
        self._env = env
        self._value_function = copy.deepcopy(inner_algo._value_function)
        self._initial_vf_state = self._value_function.state_dict()
        self._num_grad_updates = num_grad_updates
        self._meta_batch_size = meta_batch_size
        self._inner_algo = inner_algo
        self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
        self._meta_optimizer = make_optimizer(meta_optimizer,
                                              module=policy,
                                              lr=_Default(outer_lr),
                                              eps=_Default(1e-5))
        self._evaluate_every_n_epochs = evaluate_every_n_epochs
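This appears to be the constructor of garage's PyTorch MAML meta-learner: it keeps the inner algorithm, wraps the policy in a DifferentiableSGD optimizer for the inner-loop adaptation steps, and builds the outer (meta) optimizer through make_optimizer. Code Example #4 below is a concrete wrapper that forwards its arguments to this constructor via super().__init__(). As the calls in this example and in Examples #3 and #4 suggest, make_optimizer accepts either a bare optimizer class or a (class, kwargs) tuple; the sketch below only illustrates those two argument forms, and the torch import and values are illustrative assumptions rather than part of the original code.

import torch

# Assumed illustration of the two meta_optimizer forms this constructor accepts.
# Form 1: a bare optimizer class; the _Default lr/eps above are then applied.
meta_optimizer = torch.optim.Adam

# Form 2: a (class, kwargs) tuple, the form Code Example #4 passes for a
# KL-constrained ConjugateGradientOptimizer.
meta_optimizer = (torch.optim.Adam, dict(lr=1e-3, eps=1e-5))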
Code Example #2
    def __init__(self,
                 optimizer=None,
                 learning_rate=None,
                 max_epochs=1000,
                 tolerance=1e-6,
                 batch_size=32,
                 callback=None,
                 verbose=False,
                 name='FirstOrderOptimizer'):
        self._opt_fun = None
        self._target = None
        self._callback = callback
        if optimizer is None:
            optimizer = tf.compat.v1.train.AdamOptimizer
        learning_rate = learning_rate or dict(learning_rate=_Default(1e-3))
        if not isinstance(learning_rate, dict):
            learning_rate = dict(learning_rate=learning_rate)

        self._tf_optimizer = optimizer
        self._learning_rate = learning_rate
        self._max_epochs = max_epochs
        self._tolerance = tolerance
        self._batch_size = batch_size
        self._verbose = verbose
        self._input_vars = None
        self._train_op = None
        self._name = name
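Code Example #2 is the constructor of a TensorFlow first-order optimizer wrapper (garage's FirstOrderOptimizer, judging by the default name argument). It falls back to tf.compat.v1.train.AdamOptimizer with a learning rate of 1e-3, and it normalizes the learning rate into a dict, so callers may pass either a plain float or a dict. A minimal construction sketch, assuming the class is importable under that name; the keyword values are illustrative.

import tensorflow as tf

# Defaults only: Adam with learning_rate=1e-3.
opt = FirstOrderOptimizer()

# Explicit optimizer class and a plain float learning rate; the constructor
# wraps the float into dict(learning_rate=...) itself.
opt = FirstOrderOptimizer(optimizer=tf.compat.v1.train.GradientDescentOptimizer,
                          learning_rate=1e-2,
                          max_epochs=500,
                          batch_size=64,
                          verbose=True)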
Code Example #3
    def __init__(self,
                 env_spec,
                 policy,
                 baseline,
                 max_path_length=500,
                 discount=0.99,
                 gae_lambda=1,
                 center_adv=True,
                 positive_adv=False,
                 fixed_horizon=False,
                 epsilon=0.5,
                 l2_reg_dual=0.,
                 l2_reg_loss=0.,
                 optimizer=LbfgsOptimizer,
                 optimizer_args=None,
                 dual_optimizer=scipy.optimize.fmin_l_bfgs_b,
                 dual_optimizer_args=None,
                 name='REPS'):
        optimizer_args = optimizer_args or dict(max_opt_itr=_Default(50))
        dual_optimizer_args = dual_optimizer_args or dict(maxiter=50)

        self.policy = policy
        self.max_path_length = max_path_length

        self._env_spec = env_spec
        self._baseline = baseline
        self._discount = discount
        self._gae_lambda = gae_lambda
        self._center_adv = center_adv
        self._positive_adv = positive_adv
        self._fixed_horizon = fixed_horizon
        self._flatten_input = True

        self._name = name
        self._name_scope = tf.name_scope(self._name)
        self._old_policy = policy.clone('old_policy')

        self._feat_diff = None
        self._param_eta = None
        self._param_v = None
        self._f_dual = None
        self._f_dual_grad = None
        self._f_policy_kl = None

        self._optimizer = make_optimizer(optimizer, **optimizer_args)
        self._dual_optimizer = dual_optimizer
        self._dual_optimizer_args = dual_optimizer_args
        self._epsilon = float(epsilon)
        self._l2_reg_dual = float(l2_reg_dual)
        self._l2_reg_loss = float(l2_reg_loss)

        self._episode_reward_mean = collections.deque(maxlen=100)
        if policy.vectorized:
            self.sampler_cls = OnPolicyVectorizedSampler
        else:
            self.sampler_cls = BatchSampler

        self.init_opt()
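Code Example #3 is the constructor of garage's TF REPS (Relative Entropy Policy Search) algorithm, judging by the default name argument: it clones an 'old_policy' snapshot, sets up an LBFGS optimizer for the surrogate loss and scipy's fmin_l_bfgs_b for the dual problem, and chooses a sampler based on whether the policy is vectorized. A minimal construction sketch; env_spec, policy (which must provide .clone() and .vectorized), and baseline are placeholders assumed to be built elsewhere, and the keyword values are illustrative.

# Placeholders assumed to exist: env_spec, policy, baseline.
algo = REPS(env_spec=env_spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            epsilon=0.5)   # KL bound enforced through the dual optimization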
Code Example #4
    def __init__(self,
                 env,
                 policy,
                 value_function,
                 inner_lr=_Default(1e-2),
                 outer_lr=1e-3,
                 max_kl_step=0.01,
                 max_path_length=500,
                 discount=0.99,
                 gae_lambda=1,
                 center_adv=True,
                 positive_adv=False,
                 policy_ent_coeff=0.0,
                 use_softplus_entropy=False,
                 stop_entropy_gradient=False,
                 entropy_method='no_entropy',
                 meta_batch_size=40,
                 num_grad_updates=1,
                 meta_evaluator=None,
                 evaluate_every_n_epochs=1):

        policy_optimizer = OptimizerWrapper(
            (torch.optim.Adam, dict(lr=inner_lr)), policy)
        vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=inner_lr)),
                                        value_function)

        inner_algo = VPG(env.spec,
                         policy,
                         value_function,
                         policy_optimizer=policy_optimizer,
                         vf_optimizer=vf_optimizer,
                         max_path_length=max_path_length,
                         num_train_per_epoch=1,
                         discount=discount,
                         gae_lambda=gae_lambda,
                         center_adv=center_adv,
                         positive_adv=positive_adv,
                         policy_ent_coeff=policy_ent_coeff,
                         use_softplus_entropy=use_softplus_entropy,
                         stop_entropy_gradient=stop_entropy_gradient,
                         entropy_method=entropy_method)

        meta_optimizer = (ConjugateGradientOptimizer,
                          dict(max_constraint_value=max_kl_step))

        super().__init__(inner_algo=inner_algo,
                         env=env,
                         policy=policy,
                         meta_optimizer=meta_optimizer,
                         meta_batch_size=meta_batch_size,
                         inner_lr=inner_lr,
                         outer_lr=outer_lr,
                         num_grad_updates=num_grad_updates,
                         meta_evaluator=meta_evaluator,
                         evaluate_every_n_epochs=evaluate_every_n_epochs)
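Code Example #4 builds a VPG inner algorithm that shares the policy and value function, then forwards everything to the meta-learner constructor from Code Example #1, passing a ConjugateGradientOptimizer bounded by max_kl_step as the outer optimizer; this matches garage's MAML-TRPO variant. A minimal construction sketch; env (with .spec), policy, and value_function are placeholders assumed to be garage PyTorch objects built elsewhere, MAMLTRPO is the assumed class name, and the keyword values are illustrative.

# Placeholders assumed to exist: env, policy, value_function.
algo = MAMLTRPO(env=env,
                policy=policy,
                value_function=value_function,
                max_path_length=100,
                discount=0.99,
                meta_batch_size=20,
                inner_lr=0.1,
                outer_lr=1e-3,
                num_grad_updates=1)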
Code Example #5
    def __init__(self,
                 env_spec,
                 policy,
                 qf,
                 replay_buffer,
                 exploration_policy=None,
                 steps_per_epoch=20,
                 min_buffer_size=int(1e4),
                 buffer_batch_size=64,
                 rollout_batch_size=1,
                 n_train_steps=50,
                 max_path_length=None,
                 qf_lr=_Default(0.001),
                 qf_optimizer=tf.compat.v1.train.AdamOptimizer,
                 discount=1.0,
                 target_network_update_freq=5,
                 grad_norm_clipping=None,
                 double_q=False,
                 reward_scale=1.,
                 smooth_return=True,
                 name='DQN'):
        self._qf_optimizer = qf_optimizer
        self._qf_lr = qf_lr
        self._name = name
        self._target_network_update_freq = target_network_update_freq
        self._grad_norm_clipping = grad_norm_clipping
        self._double_q = double_q

        # clone a target q-function
        self._target_qf = qf.clone('target_qf')

        super(DQN, self).__init__(env_spec=env_spec,
                                  policy=policy,
                                  qf=qf,
                                  exploration_policy=exploration_policy,
                                  min_buffer_size=min_buffer_size,
                                  n_train_steps=n_train_steps,
                                  steps_per_epoch=steps_per_epoch,
                                  buffer_batch_size=buffer_batch_size,
                                  rollout_batch_size=rollout_batch_size,
                                  replay_buffer=replay_buffer,
                                  max_path_length=max_path_length,
                                  discount=discount,
                                  reward_scale=reward_scale,
                                  smooth_return=smooth_return)
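Code Example #5 is the constructor of garage's TF DQN (named explicitly in the super() call): it stores the Q-function optimizer settings and gradient clipping, clones a target Q-function, and forwards the replay-buffer and off-policy bookkeeping to its parent class. A minimal construction sketch; env_spec, policy, qf (which must provide .clone()), replay_buffer, and exploration_policy are placeholders assumed to be built elsewhere, and the keyword values are illustrative.

# Placeholders assumed to exist: env_spec, policy, qf, replay_buffer,
# exploration_policy (e.g. an epsilon-greedy wrapper around the policy).
algo = DQN(env_spec=env_spec,
           policy=policy,
           qf=qf,
           replay_buffer=replay_buffer,
           exploration_policy=exploration_policy,
           steps_per_epoch=10,
           min_buffer_size=int(1e4),
           n_train_steps=500,
           max_path_length=200,
           qf_lr=1e-4,
           discount=0.99,
           double_q=True,                    # use the Double-DQN target
           target_network_update_freq=100)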
Code Example #6
    def __init__(
            self,
            env_spec,
            policy,
            qf,
            replay_buffer,
            *,  # Everything after this is keyword-only.
            steps_per_epoch=20,
            n_train_steps=50,
            max_path_length=None,
            max_eval_path_length=None,
            buffer_batch_size=64,
            min_buffer_size=int(1e4),
            rollout_batch_size=1,
            exploration_policy=None,
            target_update_tau=0.01,
            discount=0.99,
            policy_weight_decay=0,
            qf_weight_decay=0,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            policy_lr=_Default(1e-4),
            qf_lr=_Default(1e-3),
            clip_pos_returns=False,
            clip_return=np.inf,
            max_action=None,
            reward_scale=1.,
            smooth_return=True,
            name='DDPG'):
        action_bound = env_spec.action_space.high
        self._max_action = action_bound if max_action is None else max_action
        self._tau = target_update_tau
        self._policy_weight_decay = policy_weight_decay
        self._qf_weight_decay = qf_weight_decay
        self._name = name
        self._clip_pos_returns = clip_pos_returns
        self._clip_return = clip_return
        self._success_history = deque(maxlen=100)

        self._episode_rewards = []
        self._episode_policy_losses = []
        self._episode_qf_losses = []
        self._epoch_ys = []
        self._epoch_qs = []

        self._target_policy = policy.clone('target_policy')
        self._target_qf = qf.clone('target_qf')
        self._policy_optimizer = policy_optimizer
        self._qf_optimizer = qf_optimizer
        self._policy_lr = policy_lr
        self._qf_lr = qf_lr

        super(DDPG, self).__init__(env_spec=env_spec,
                                   policy=policy,
                                   qf=qf,
                                   n_train_steps=n_train_steps,
                                   steps_per_epoch=steps_per_epoch,
                                   max_path_length=max_path_length,
                                   max_eval_path_length=max_eval_path_length,
                                   buffer_batch_size=buffer_batch_size,
                                   min_buffer_size=min_buffer_size,
                                   rollout_batch_size=rollout_batch_size,
                                   exploration_policy=exploration_policy,
                                   replay_buffer=replay_buffer,
                                   use_target=True,
                                   discount=discount,
                                   reward_scale=reward_scale,
                                   smooth_return=smooth_return)
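Code Example #6 is the constructor of garage's TF DDPG (named explicitly in the super() call): it reads the action bound from env_spec.action_space.high, clones target copies of both the policy and the Q-function, stores the actor/critic optimizer settings, and forwards the off-policy bookkeeping to its parent with use_target=True. A minimal construction sketch; the collaborator objects are placeholders assumed to be continuous-control components built elsewhere, and the keyword values are illustrative.

# Placeholders assumed to exist: env_spec, policy, qf (both clonable),
# replay_buffer, exploration_policy (e.g. noise added around the policy).
algo = DDPG(env_spec=env_spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            exploration_policy=exploration_policy,
            steps_per_epoch=20,
            max_path_length=100,
            target_update_tau=1e-2,
            discount=0.9,
            policy_lr=1e-4,
            qf_lr=1e-3,
            n_train_steps=50)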