def __init__(self,
             inner_algo,
             env,
             policy,
             sampler,
             task_sampler,
             meta_optimizer,
             meta_batch_size=40,
             inner_lr=0.1,
             outer_lr=1e-3,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """Set up a MAML-style meta-learner around an inner policy-gradient algo.

    Args:
        inner_algo: Adaptation algorithm; must expose ``max_episode_length``
            and a ``_value_function``.
        env: Environment for meta-training.
        policy: Meta-policy whose initialization is learned.
        sampler: Sampler used to collect episodes.
        task_sampler: Sampler that draws tasks for each meta-batch.
        meta_optimizer: Optimizer (class or ``(class, kwargs)``) for the
            outer/meta update.
        meta_batch_size (int): Number of tasks per meta-update.
        inner_lr (float): Learning rate of the inner adaptation step.
        outer_lr (float): Learning rate of the outer meta step.
        num_grad_updates (int): Inner adaptation steps per task.
        meta_evaluator: Optional meta-test evaluator.
        evaluate_every_n_epochs (int): Meta-evaluation period, in epochs.
    """
    self._sampler = sampler
    self.max_episode_length = inner_algo.max_episode_length

    self._meta_evaluator = meta_evaluator
    self._evaluate_every_n_epochs = evaluate_every_n_epochs

    self._policy = policy
    self._env = env
    self._task_sampler = task_sampler

    # Private copy of the inner value function plus its initial weights, so
    # adaptation can restart from the same point for every task.
    self._value_function = copy.deepcopy(inner_algo._value_function)
    self._initial_vf_state = self._value_function.state_dict()

    self._num_grad_updates = num_grad_updates
    self._meta_batch_size = meta_batch_size
    self._inner_algo = inner_algo

    # Differentiable SGD lets gradients flow through the inner adaptation
    # step; the meta optimizer updates the pre-adaptation policy parameters.
    self._inner_optimizer = DifferentiableSGD(self._policy, lr=inner_lr)
    self._meta_optimizer = make_optimizer(meta_optimizer,
                                          module=policy,
                                          lr=_Default(outer_lr),
                                          eps=_Default(1e-5))
def __init__(self,
             optimizer=None,
             learning_rate=None,
             max_optimization_epochs=1000,
             tolerance=1e-6,
             batch_size=32,
             callback=None,
             verbose=False,
             name='FirstOrderOptimizer'):
    """First-order (gradient-based) optimizer for TensorFlow targets.

    Args:
        optimizer: TF optimizer class; defaults to Adam when None.
        learning_rate: Learning rate, either a plain value or a dict of
            optimizer kwargs.
        max_optimization_epochs (int): Maximum optimization passes.
        tolerance (float): Loss-improvement threshold for early stopping.
        batch_size (int): Minibatch size for optimization.
        callback: Optional per-epoch callback.
        verbose (bool): Whether to log optimization progress.
        name (str): Variable-scope name for this optimizer.
    """
    self._opt_fun = None
    self._target = None
    self._callback = callback
    if optimizer is None:
        optimizer = tf.compat.v1.train.AdamOptimizer
        # NOTE(review): in the original run-on source the learning-rate
        # default appears to apply only when the optimizer is also
        # defaulted — confirm against the upstream file.
        learning_rate = learning_rate or dict(learning_rate=_Default(1e-3))
    if not isinstance(learning_rate, dict):
        # Normalize a bare value into optimizer-kwargs form.
        learning_rate = dict(learning_rate=learning_rate)
    self._tf_optimizer = optimizer
    self._learning_rate = learning_rate
    self._max_optimization_epochs = max_optimization_epochs
    self._tolerance = tolerance
    self._batch_size = batch_size
    self._verbose = verbose
    # Built lazily once a target is supplied.
    self._input_vars = None
    self._train_op = None
    self._name = name
def __init__(self,
             env,
             policy,
             value_function,
             sampler,
             task_sampler,
             inner_lr=_Default(1e-2),
             outer_lr=1e-3,
             max_kl_step=0.01,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=40,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """MAML with a TRPO-style (conjugate-gradient) meta step.

    The inner adaptation algorithm is vanilla policy gradient (VPG) run once
    per epoch; the meta update is constrained by ``max_kl_step``.

    Args:
        env: Environment for meta-training (provides ``env.spec``).
        policy: Meta-policy to train.
        value_function: Baseline/value function for the inner algorithm.
        sampler: Episode sampler.
        task_sampler: Sampler that draws tasks per meta-batch.
        inner_lr: Adaptation-step learning rate.
        outer_lr (float): Meta-step learning rate.
        max_kl_step (float): KL constraint on the meta update.
        discount (float): Reward discount.
        gae_lambda (float): GAE lambda.
        center_adv (bool): Center advantages.
        positive_adv (bool): Shift advantages to be positive.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Softplus the entropy estimate.
        stop_entropy_gradient (bool): Stop gradients through entropy.
        entropy_method (str): Entropy regularization mode.
        meta_batch_size (int): Tasks per meta-update.
        num_grad_updates (int): Inner adaptation steps per task.
        meta_evaluator: Optional meta-test evaluator.
        evaluate_every_n_epochs (int): Meta-evaluation period.
    """
    inner_policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    inner_vf_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), value_function)

    # Inner algorithm: one VPG training pass per epoch per task.
    inner_algo = VPG(env.spec,
                     policy,
                     value_function,
                     None,
                     policy_optimizer=inner_policy_optimizer,
                     vf_optimizer=inner_vf_optimizer,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    # Trust-region meta step solved by conjugate gradient.
    meta_optimizer = (ConjugateGradientOptimizer,
                      dict(max_constraint_value=max_kl_step))

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     sampler=sampler,
                     task_sampler=task_sampler,
                     meta_optimizer=meta_optimizer,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
def __init__(self,
             env_spec,
             policy,
             baseline,
             max_path_length=500,
             discount=0.99,
             gae_lambda=1,
             center_adv=True,
             positive_adv=False,
             fixed_horizon=False,
             epsilon=0.5,
             l2_reg_dual=0.,
             l2_reg_loss=0.,
             optimizer=LbfgsOptimizer,
             optimizer_args=None,
             dual_optimizer=scipy.optimize.fmin_l_bfgs_b,
             dual_optimizer_args=None,
             name='REPS'):
    """Relative Entropy Policy Search (REPS), TensorFlow implementation.

    Args:
        env_spec: Environment specification.
        policy: Policy to train; cloned into ``old_policy`` for KL terms.
        baseline: Value baseline.
        max_path_length (int): Maximum rollout length.
        discount (float): Reward discount.
        gae_lambda (float): GAE lambda.
        center_adv (bool): Center advantages.
        positive_adv (bool): Shift advantages to be positive.
        fixed_horizon (bool): Treat all episodes as fixed-horizon.
        epsilon (float): KL divergence constraint of the dual problem.
        l2_reg_dual (float): L2 regularization on the dual variables.
        l2_reg_loss (float): L2 regularization on the policy loss.
        optimizer: Policy optimizer class (default L-BFGS).
        optimizer_args (dict): kwargs for the policy optimizer.
        dual_optimizer: Optimizer callable for the dual problem.
        dual_optimizer_args (dict): kwargs for the dual optimizer.
        name (str): TF name scope for this algorithm.
    """
    optimizer_args = optimizer_args or dict(max_opt_itr=_Default(50))
    dual_optimizer_args = dual_optimizer_args or dict(maxiter=50)

    self.policy = policy
    self.max_path_length = max_path_length

    self._env_spec = env_spec
    self._baseline = baseline
    self._discount = discount
    self._gae_lambda = gae_lambda
    self._center_adv = center_adv
    self._positive_adv = positive_adv
    self._fixed_horizon = fixed_horizon

    self._name = name
    self._name_scope = tf.name_scope(self._name)

    # Snapshot of the current policy, kept in sync for KL computation.
    self._old_policy = policy.clone('old_policy')
    self._old_policy.parameters = self.policy.parameters

    # Placeholders populated by init_opt().
    self._feat_diff = None
    self._param_eta = None
    self._param_v = None
    self._f_dual = None
    self._f_dual_grad = None
    self._f_policy_kl = None
    self._policy_network = None
    self._old_policy_network = None

    self._optimizer = make_optimizer(optimizer, **optimizer_args)
    self._dual_optimizer = dual_optimizer
    self._dual_optimizer_args = dual_optimizer_args
    self._epsilon = float(epsilon)
    self._l2_reg_dual = float(l2_reg_dual)
    self._l2_reg_loss = float(l2_reg_loss)

    self._episode_reward_mean = collections.deque(maxlen=100)
    self.sampler_cls = RaySampler

    self.init_opt()
def __init__(self,
             env_spec,
             policy,
             qf,
             replay_buffer,
             exploration_policy=None,
             steps_per_epoch=20,
             min_buffer_size=int(1e4),
             buffer_batch_size=64,
             rollout_batch_size=1,
             n_train_steps=50,
             max_path_length=None,
             max_eval_path_length=None,
             qf_lr=_Default(0.001),
             qf_optimizer=tf.compat.v1.train.AdamOptimizer,
             discount=1.0,
             target_network_update_freq=5,
             grad_norm_clipping=None,
             double_q=False,
             reward_scale=1.,
             smooth_return=True,
             name='DQN'):
    """Deep Q-Network (TF), off-policy-vectorized-sampler variant.

    Args:
        env_spec: Environment specification.
        policy: Greedy/argmax policy over the Q-function.
        qf: Q-function; cloned into a target network.
        replay_buffer: Experience replay buffer.
        exploration_policy: Optional exploration wrapper policy.
        steps_per_epoch (int): Sampler iterations per epoch.
        min_buffer_size (int): Transitions required before training.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        rollout_batch_size (int): Rollouts per sampler step.
        n_train_steps (int): Gradient steps per sampler iteration.
        max_path_length (int): Maximum rollout length.
        max_eval_path_length (int): Maximum evaluation rollout length.
        qf_lr: Q-function learning rate.
        qf_optimizer: TF optimizer class for the Q-function.
        discount (float): Reward discount.
        target_network_update_freq (int): Target-sync period.
        grad_norm_clipping (float): Optional gradient-norm clip.
        double_q (bool): Use Double-DQN targets.
        reward_scale (float): Reward scaling factor.
        smooth_return (bool): Smooth returns when logging.
        name (str): Name of this algorithm.
    """
    self._qf_optimizer = qf_optimizer
    self._qf_lr = qf_lr
    self._name = name
    self._target_network_update_freq = target_network_update_freq
    self._grad_norm_clipping = grad_norm_clipping
    self._double_q = double_q

    # Clone a target Q-function for stable TD targets.
    self._target_qf = qf.clone('target_qf')

    self._min_buffer_size = min_buffer_size
    self._qf = qf
    self._steps_per_epoch = steps_per_epoch
    self._n_train_steps = n_train_steps
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale
    self._smooth_return = smooth_return
    self.max_path_length = max_path_length
    self._max_eval_path_length = max_eval_path_length

    # Public fields consumed by OffPolicyVectorizedSampler.
    self.env_spec = env_spec
    self.rollout_batch_size = rollout_batch_size
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy
    self.sampler_cls = OffPolicyVectorizedSampler

    self.init_opt()
def __init__(self,
             env,
             policy,
             value_function,
             inner_lr=_Default(1e-1),
             outer_lr=1e-3,
             lr_clip_range=5e-1,
             max_episode_length=100,
             discount=0.99,
             gae_lambda=1.0,
             center_adv=True,
             positive_adv=False,
             policy_ent_coeff=0.0,
             use_softplus_entropy=False,
             stop_entropy_gradient=False,
             entropy_method='no_entropy',
             meta_batch_size=20,
             num_grad_updates=1,
             meta_evaluator=None,
             evaluate_every_n_epochs=1):
    """MAML with a clipped-surrogate (PPO) inner algorithm and Adam meta step.

    Args:
        env: Environment for meta-training (provides ``env.spec``).
        policy: Meta-policy to train.
        value_function: Baseline/value function for the inner algorithm.
        inner_lr: Adaptation-step learning rate.
        outer_lr (float): Meta-step learning rate.
        lr_clip_range (float): PPO likelihood-ratio clip range.
        max_episode_length (int): Maximum episode length.
        discount (float): Reward discount.
        gae_lambda (float): GAE lambda.
        center_adv (bool): Center advantages.
        positive_adv (bool): Shift advantages to be positive.
        policy_ent_coeff (float): Entropy bonus coefficient.
        use_softplus_entropy (bool): Softplus the entropy estimate.
        stop_entropy_gradient (bool): Stop gradients through entropy.
        entropy_method (str): Entropy regularization mode.
        meta_batch_size (int): Tasks per meta-update.
        num_grad_updates (int): Inner adaptation steps per task.
        meta_evaluator: Optional meta-test evaluator.
        evaluate_every_n_epochs (int): Meta-evaluation period.
    """
    inner_policy_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), policy)
    inner_vf_optimizer = OptimizerWrapper(
        (torch.optim.Adam, dict(lr=inner_lr)), value_function)

    # Inner algorithm: one PPO training pass per epoch per task.
    inner_algo = PPO(env.spec,
                     policy,
                     value_function,
                     policy_optimizer=inner_policy_optimizer,
                     vf_optimizer=inner_vf_optimizer,
                     lr_clip_range=lr_clip_range,
                     max_episode_length=max_episode_length,
                     num_train_per_epoch=1,
                     discount=discount,
                     gae_lambda=gae_lambda,
                     center_adv=center_adv,
                     positive_adv=positive_adv,
                     policy_ent_coeff=policy_ent_coeff,
                     use_softplus_entropy=use_softplus_entropy,
                     stop_entropy_gradient=stop_entropy_gradient,
                     entropy_method=entropy_method)

    super().__init__(inner_algo=inner_algo,
                     env=env,
                     policy=policy,
                     meta_optimizer=torch.optim.Adam,
                     meta_batch_size=meta_batch_size,
                     inner_lr=inner_lr,
                     outer_lr=outer_lr,
                     num_grad_updates=num_grad_updates,
                     meta_evaluator=meta_evaluator,
                     evaluate_every_n_epochs=evaluate_every_n_epochs)
def __init__(self,
             env_spec,
             policy,
             qf,
             replay_buffer,
             exploration_policy=None,
             steps_per_epoch=20,
             min_buffer_size=int(1e4),
             buffer_batch_size=64,
             max_episode_length_eval=None,
             n_train_steps=50,
             qf_lr=_Default(0.001),
             qf_optimizer=tf.compat.v1.train.AdamOptimizer,
             discount=1.0,
             target_network_update_freq=5,
             grad_norm_clipping=None,
             double_q=False,
             reward_scale=1.,
             name='DQN'):
    """Deep Q-Network (TF), local-sampler/fragment-worker variant.

    Args:
        env_spec: Environment specification (provides max episode length).
        policy: Greedy/argmax policy over the Q-function.
        qf: Q-function; cloned into a target network.
        replay_buffer: Experience replay buffer.
        exploration_policy: Optional exploration wrapper policy.
        steps_per_epoch (int): Sampler iterations per epoch.
        min_buffer_size (int): Transitions required before training.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        max_episode_length_eval (int): Evaluation episode cap; falls back
            to the env spec's max episode length when None.
        n_train_steps (int): Gradient steps per sampler iteration.
        qf_lr: Q-function learning rate.
        qf_optimizer: TF optimizer class for the Q-function.
        discount (float): Reward discount.
        target_network_update_freq (int): Target-sync period.
        grad_norm_clipping (float): Optional gradient-norm clip.
        double_q (bool): Use Double-DQN targets.
        reward_scale (float): Reward scaling factor.
        name (str): Name of this algorithm.
    """
    self._qf_optimizer = qf_optimizer
    self._qf_lr = qf_lr
    self._name = name
    self._target_network_update_freq = target_network_update_freq
    self._grad_norm_clipping = grad_norm_clipping
    self._double_q = double_q

    # Clone a target Q-function for stable TD targets.
    self._target_qf = qf.clone('target_qf')

    self._min_buffer_size = min_buffer_size
    self._qf = qf
    self._steps_per_epoch = steps_per_epoch
    self._n_train_steps = n_train_steps
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale

    self.max_episode_length = env_spec.max_episode_length
    # Evaluation length defaults to the training length unless overridden.
    self._max_episode_length_eval = (max_episode_length_eval
                                     if max_episode_length_eval is not None
                                     else env_spec.max_episode_length)

    self._eval_env = None
    self.env_spec = env_spec
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy
    self.sampler_cls = LocalSampler
    self.worker_cls = FragmentWorker

    self.init_opt()
def __init__(self,
             env_spec,
             policy,
             qf,
             replay_buffer,
             exploration_policy=None,
             steps_per_epoch=20,
             min_buffer_size=int(1e4),
             buffer_batch_size=64,
             rollout_batch_size=1,
             n_train_steps=50,
             max_path_length=None,
             qf_lr=_Default(0.001),
             qf_optimizer=tf.compat.v1.train.AdamOptimizer,
             discount=1.0,
             target_network_update_freq=5,
             grad_norm_clipping=None,
             double_q=False,
             reward_scale=1.,
             smooth_return=True,
             name='DQN'):
    """Deep Q-Network (TF) delegating shared setup to the off-policy base.

    Only DQN-specific state is stored here; everything common to off-policy
    algorithms (buffer, sampler bookkeeping, discount, ...) is passed to
    the base-class constructor.

    Args:
        env_spec: Environment specification.
        policy: Greedy/argmax policy over the Q-function.
        qf: Q-function; cloned into a target network.
        replay_buffer: Experience replay buffer.
        exploration_policy: Optional exploration wrapper policy.
        steps_per_epoch (int): Sampler iterations per epoch.
        min_buffer_size (int): Transitions required before training.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        rollout_batch_size (int): Rollouts per sampler step.
        n_train_steps (int): Gradient steps per sampler iteration.
        max_path_length (int): Maximum rollout length.
        qf_lr: Q-function learning rate.
        qf_optimizer: TF optimizer class for the Q-function.
        discount (float): Reward discount.
        target_network_update_freq (int): Target-sync period.
        grad_norm_clipping (float): Optional gradient-norm clip.
        double_q (bool): Use Double-DQN targets.
        reward_scale (float): Reward scaling factor.
        smooth_return (bool): Smooth returns when logging.
        name (str): Name of this algorithm.
    """
    self._qf_optimizer = qf_optimizer
    self._qf_lr = qf_lr
    self._name = name
    self._target_network_update_freq = target_network_update_freq
    self._grad_norm_clipping = grad_norm_clipping
    self._double_q = double_q

    # Clone a target Q-function for stable TD targets.
    self._target_qf = qf.clone('target_qf')

    super(DQN, self).__init__(env_spec=env_spec,
                              policy=policy,
                              qf=qf,
                              exploration_policy=exploration_policy,
                              min_buffer_size=min_buffer_size,
                              n_train_steps=n_train_steps,
                              steps_per_epoch=steps_per_epoch,
                              buffer_batch_size=buffer_batch_size,
                              rollout_batch_size=rollout_batch_size,
                              replay_buffer=replay_buffer,
                              max_path_length=max_path_length,
                              discount=discount,
                              reward_scale=reward_scale,
                              smooth_return=smooth_return)
def __init__(
        self,
        env_spec,
        learner,
        *,
        batch_size,
        source=None,
        sampler=None,
        policy_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-3),
        loss='log_prob',
        minibatches_per_epoch=16,
        name='BC',
):
    """Behavioral Cloning: train ``learner`` to imitate ``source``.

    Args:
        env_spec: Environment specification.
        learner: Policy (or policy-like module) being trained.
        batch_size (int): Transitions per training batch.
        source: Expert to imitate — either a Policy (rolled out with
            ``sampler``) or an iterable dataset of batches.
        sampler: Sampler; required when ``source`` is a Policy.
        policy_optimizer: Optimizer class for the learner.
        policy_lr: Learner learning rate.
        loss (str): Either ``'log_prob'`` or ``'mse'``.
        minibatches_per_epoch (int): Minibatches per epoch.
        name (str): Name of this algorithm.

    Raises:
        ValueError: If ``loss`` is not ``'log_prob'`` or ``'mse'``.
        TypeError: If ``source`` is a Policy but no Sampler was given.
    """
    self._source = source
    self.learner = learner
    self._optimizer = make_optimizer(policy_optimizer,
                                     module=self.learner,
                                     lr=policy_lr)
    if loss not in ('log_prob', 'mse'):
        raise ValueError('Loss should be either "log_prob" or "mse".')
    self._loss = loss
    self._minibatches_per_epoch = minibatches_per_epoch
    self._eval_env = None
    self._batch_size = batch_size
    self._name = name

    # Public fields for sampling.
    self._env_spec = env_spec
    self.exploration_policy = None
    # NOTE: the original first set ``self.policy = self.learner`` ("for
    # plotting") and immediately overwrote it with None; the dead store is
    # removed here and the original final value (None) is preserved.
    self.policy = None
    self.max_episode_length = env_spec.max_episode_length
    self._sampler = sampler
    if isinstance(self._source, Policy):
        # A policy source must be rolled out, so a sampler is mandatory.
        # (Removed a redundant ``self._source = source`` re-assignment.)
        self.exploration_policy = self._source
        if not isinstance(self._sampler, Sampler):
            raise TypeError('Source is a policy. Missing a sampler.')
    else:
        # Treat the source as a dataset and cycle through it forever.
        self._source = itertools.cycle(iter(source))
def __init__(
        self,
        env_spec,
        learner,
        *,
        batch_size,
        source=None,
        max_path_length=None,
        policy_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-3),
        loss='log_prob',
        minibatches_per_epoch=16,
        name='BC',
):
    """Behavioral Cloning: train ``learner`` to imitate ``source``.

    Args:
        env_spec: Environment specification.
        learner: Policy (or policy-like module) being trained.
        batch_size (int): Transitions per training batch.
        source: Expert to imitate — either a Policy (rolled out) or an
            iterable dataset of batches.
        max_path_length (int): Maximum rollout length; required when
            ``source`` is a Policy.
        policy_optimizer: Optimizer class for the learner.
        policy_lr: Learner learning rate.
        loss (str): Either ``'log_prob'`` or ``'mse'``.
        minibatches_per_epoch (int): Minibatches per epoch.
        name (str): Name of this algorithm.

    Raises:
        ValueError: If ``loss`` is invalid, or ``source`` is a Policy and
            ``max_path_length`` was not provided.
    """
    self._source = source
    self.learner = learner
    self._optimizer = make_optimizer(policy_optimizer,
                                     module=self.learner,
                                     lr=policy_lr)
    if loss not in ('log_prob', 'mse'):
        raise ValueError('Loss should be either "log_prob" or "mse".')
    self._loss = loss
    self._minibatches_per_epoch = minibatches_per_epoch
    self._eval_env = None
    self._batch_size = batch_size
    self._name = name

    # Public fields for sampling.
    self.env_spec = env_spec
    self.policy = None
    self.max_path_length = max_path_length
    self.sampler_cls = None
    if isinstance(self._source, Policy):
        # Rolling out a policy source needs a horizon and a sampler.
        if max_path_length is None:
            raise ValueError('max_path_length must be passed if the '
                             'source is a policy')
        self.policy = self._source
        self.sampler_cls = RaySampler
        self._source = source
    else:
        # Treat the source as a dataset and cycle through it forever.
        self._source = itertools.cycle(iter(source))
def __init__(
        self,
        env_spec,
        policy,
        qf,
        qf2,
        replay_buffer,
        *,  # Everything after this is numbers.
        target_update_tau=0.01,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=tf.compat.v1.train.AdamOptimizer,
        qf_optimizer=tf.compat.v1.train.AdamOptimizer,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        discount=0.99,
        max_episode_length_eval=None,
        max_action=None,
        name='TD3',
        steps_per_epoch=20,
        n_train_steps=50,
        buffer_batch_size=64,
        min_buffer_size=1e4,
        reward_scale=1.,
        exploration_policy_sigma=0.2,
        actor_update_period=2,
        exploration_policy_clip=0.5,
        exploration_policy=None):
    """Twin Delayed DDPG (TD3), TensorFlow implementation.

    Fix over the original: ``self._n_train_steps`` was assigned twice with
    the same value; the duplicate assignment is removed.

    Args:
        env_spec: Environment specification (action bounds, episode length).
        policy: Policy to train; cloned into a target policy.
        qf: First Q-function; cloned into a target.
        qf2: Second Q-function; cloned into a target.
        replay_buffer: Experience replay buffer.
        target_update_tau (float): Soft target-update coefficient.
        policy_weight_decay (float): Policy L2 weight decay.
        qf_weight_decay (float): Q-function L2 weight decay.
        policy_optimizer: TF optimizer class for the policy.
        qf_optimizer: TF optimizer class for the Q-functions.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        discount (float): Reward discount.
        max_episode_length_eval (int): Evaluation episode cap; falls back
            to the env spec's max episode length when None.
        max_action (float): Action bound; defaults to the env's bound.
        name (str): Name of this algorithm.
        steps_per_epoch (int): Sampler iterations per epoch.
        n_train_steps (int): Gradient steps per sampler iteration.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        reward_scale (float): Reward scaling factor.
        exploration_policy_sigma (float): Target-policy smoothing noise.
        actor_update_period (int): Delayed actor-update period.
        exploration_policy_clip (float): Smoothing-noise clip bound.
        exploration_policy: Optional exploration wrapper policy.
    """
    action_bound = env_spec.action_space.high
    self._max_action = action_bound if max_action is None else max_action
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._name = name
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return

    # Per-epoch logging accumulators.
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    # Target networks for the actor and both critics.
    self._target_policy = policy.clone('target_policy')
    self._target_qf = qf.clone('target_qf')
    self.qf2 = qf2
    self.qf = qf
    self._exploration_policy_sigma = exploration_policy_sigma
    self._exploration_policy_clip = exploration_policy_clip
    self._actor_update_period = actor_update_period
    self._action_loss = None
    self._target_qf2 = qf2.clone('target_qf2')

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer
    self._policy_lr = policy_lr
    self._qf_lr = qf_lr
    self._policy = policy

    # Note: the original assigned _n_train_steps twice; assigned once here.
    self._n_train_steps = n_train_steps
    self._min_buffer_size = min_buffer_size
    self._qf = qf
    self._steps_per_epoch = steps_per_epoch
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale

    self.max_episode_length = env_spec.max_episode_length
    # Evaluation length defaults to the training length unless overridden.
    self._max_episode_length_eval = env_spec.max_episode_length
    if max_episode_length_eval is not None:
        self._max_episode_length_eval = max_episode_length_eval

    self._eval_env = None
    self._env_spec = env_spec
    self._replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy
    self.sampler_cls = LocalSampler
    self.worker_cls = FragmentWorker

    self._init_opt()
def __init__(
        self,
        env_spec,
        policy,
        qf1,
        qf2,
        replay_buffer,
        *,  # Everything after this is numbers.
        max_episode_length_eval=None,
        grad_steps_per_env_step,
        exploration_policy,
        uniform_random_policy=None,
        max_action=None,
        target_update_tau=0.005,
        discount=0.99,
        reward_scaling=1.,
        update_actor_interval=2,
        buffer_batch_size=64,
        replay_buffer_size=1e6,
        min_buffer_size=1e4,
        exploration_noise=0.1,
        policy_noise=0.2,
        policy_noise_clip=0.5,
        clip_return=np.inf,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        num_evaluation_episodes=10,
        steps_per_epoch=20,
        start_steps=10000,
        update_after=1000,
        use_deterministic_evaluation=False):
    """Twin Delayed DDPG (TD3), PyTorch implementation.

    Args:
        env_spec: Environment specification (action bounds, episode length).
        policy: Policy to train; deep-copied into a target policy.
        qf1: First Q-function; deep-copied into a target.
        qf2: Second Q-function; deep-copied into a target.
        replay_buffer: Experience replay buffer.
        max_episode_length_eval (int): Evaluation episode cap; falls back
            to the env spec's max episode length when None.
        grad_steps_per_env_step (int): Gradient steps per environment step.
        exploration_policy: Policy used to collect experience.
        uniform_random_policy: Optional warm-up random policy.
        max_action (float): Action bound; defaults to the env's bound.
        target_update_tau (float): Soft target-update coefficient.
        discount (float): Reward discount.
        reward_scaling (float): Reward scaling factor.
        update_actor_interval (int): Delayed actor-update period.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        replay_buffer_size (int): Replay buffer capacity.
        min_buffer_size (int): Transitions required before training.
        exploration_noise (float): Action-space exploration noise.
        policy_noise (float): Target-policy smoothing noise.
        policy_noise_clip (float): Smoothing-noise clip bound.
        clip_return (float): Return clipping bound.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        policy_optimizer: Optimizer class for the policy.
        qf_optimizer: Optimizer class for the Q-functions.
        num_evaluation_episodes (int): Episodes per evaluation.
        steps_per_epoch (int): Sampler iterations per epoch.
        start_steps (int): Env steps before using the learned policy.
        update_after (int): Env steps before the first gradient update.
        use_deterministic_evaluation (bool): Evaluate deterministically.
    """
    self._env_spec = env_spec
    action_bound = self._env_spec.action_space.high[0]
    self._max_action = action_bound if max_action is None else max_action
    self._action_dim = self._env_spec.action_space.shape[0]

    # Scalar hyperparameters.
    self._tau = target_update_tau
    self._discount = discount
    self._reward_scaling = reward_scaling
    self._exploration_noise = exploration_noise
    self._policy_noise = policy_noise
    self._policy_noise_clip = policy_noise_clip
    self._clip_return = clip_return
    self._replay_buffer_size = replay_buffer_size
    self._min_buffer_size = min_buffer_size
    self._buffer_batch_size = buffer_batch_size
    self._grad_steps_per_env_step = grad_steps_per_env_step
    self._update_actor_interval = update_actor_interval
    self._steps_per_epoch = steps_per_epoch
    self._start_steps = start_steps
    self._update_after = update_after
    self._num_evaluation_episodes = num_evaluation_episodes

    self.max_episode_length = env_spec.max_episode_length
    # Evaluation length defaults to the training length unless overridden.
    self._max_episode_length_eval = (max_episode_length_eval
                                     if max_episode_length_eval is not None
                                     else env_spec.max_episode_length)
    self._use_deterministic_evaluation = use_deterministic_evaluation

    # Per-epoch logging accumulators.
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    self._eval_env = None
    self.exploration_policy = exploration_policy
    self._uniform_random_policy = uniform_random_policy
    self.worker_cls = FragmentWorker
    self.sampler_cls = LocalSampler
    self._replay_buffer = replay_buffer
    self.policy = policy
    self._qf_1 = qf1
    self._qf_2 = qf2

    # Target networks start as exact copies of their online counterparts.
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf_1 = copy.deepcopy(self._qf_1)
    self._target_qf_2 = copy.deepcopy(self._qf_2)

    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer_1 = make_optimizer(qf_optimizer,
                                          module=self._qf_1,
                                          lr=qf_lr)
    self._qf_optimizer_2 = make_optimizer(qf_optimizer,
                                          module=self._qf_2,
                                          lr=qf_lr)
    self._actor_loss = torch.zeros(1)
def __init__(
        self,
        env_spec,
        policy,
        qf,
        qf2,
        replay_buffer,
        *,  # Everything after this is numbers.
        target_update_tau=0.01,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=tf.compat.v1.train.AdamOptimizer,
        qf_optimizer=tf.compat.v1.train.AdamOptimizer,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        discount=0.99,
        max_action=None,
        name='TD3',
        steps_per_epoch=20,
        max_path_length=None,
        max_eval_path_length=None,
        n_train_steps=50,
        buffer_batch_size=64,
        min_buffer_size=1e4,
        rollout_batch_size=1,
        reward_scale=1.,
        exploration_policy_sigma=0.2,
        actor_update_period=2,
        exploration_policy_clip=0.5,
        smooth_return=True,
        exploration_policy=None):
    """Twin Delayed DDPG (TD3, TF) delegating shared setup to the base.

    Only TD3-specific state (twin critics, delayed actor updates, target
    smoothing) is stored here; everything common to off-policy algorithms
    is forwarded to the base-class constructor.

    Args:
        env_spec: Environment specification (action bounds).
        policy: Policy to train; cloned into a target policy.
        qf: First Q-function; cloned into a target.
        qf2: Second Q-function; cloned into a target.
        replay_buffer: Experience replay buffer.
        target_update_tau (float): Soft target-update coefficient.
        policy_weight_decay (float): Policy L2 weight decay.
        qf_weight_decay (float): Q-function L2 weight decay.
        policy_optimizer: TF optimizer class for the policy.
        qf_optimizer: TF optimizer class for the Q-functions.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        discount (float): Reward discount.
        max_action (float): Action bound; defaults to the env's bound.
        name (str): Name of this algorithm.
        steps_per_epoch (int): Sampler iterations per epoch.
        max_path_length (int): Maximum rollout length.
        max_eval_path_length (int): Maximum evaluation rollout length.
        n_train_steps (int): Gradient steps per sampler iteration.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        rollout_batch_size (int): Rollouts per sampler step.
        reward_scale (float): Reward scaling factor.
        exploration_policy_sigma (float): Target-policy smoothing noise.
        actor_update_period (int): Delayed actor-update period.
        exploration_policy_clip (float): Smoothing-noise clip bound.
        smooth_return (bool): Smooth returns when logging.
        exploration_policy: Optional exploration wrapper policy.
    """
    action_bound = env_spec.action_space.high
    self._max_action = action_bound if max_action is None else max_action
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._name = name
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return

    # Logging accumulators.
    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    # Target networks for the actor and both critics.
    self._target_policy = policy.clone('target_policy')
    self._target_qf = qf.clone('target_qf')
    self.qf2 = qf2
    self._exploration_policy_sigma = exploration_policy_sigma
    self._exploration_policy_clip = exploration_policy_clip
    self._actor_update_period = actor_update_period
    self._action_loss = None
    self._target_qf2 = qf2.clone('target_qf2')

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer
    self._policy_lr = policy_lr
    self._qf_lr = qf_lr

    super(TD3, self).__init__(env_spec=env_spec,
                              policy=policy,
                              qf=qf,
                              replay_buffer=replay_buffer,
                              discount=discount,
                              steps_per_epoch=steps_per_epoch,
                              max_path_length=max_path_length,
                              max_eval_path_length=max_eval_path_length,
                              n_train_steps=n_train_steps,
                              buffer_batch_size=buffer_batch_size,
                              min_buffer_size=min_buffer_size,
                              rollout_batch_size=rollout_batch_size,
                              reward_scale=reward_scale,
                              smooth_return=smooth_return,
                              exploration_policy=exploration_policy)
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        sampler,
        exploration_policy=None,
        eval_env=None,
        double_q=True,
        qf_optimizer=torch.optim.Adam,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_episode_length_eval=None,
        deterministic_eval=False,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        num_eval_episodes=10,
        discount=0.99,
        qf_lr=_Default(1e-3),
        clip_rewards=None,
        clip_gradient=10,
        target_update_freq=5,
        reward_scale=1.):
    """Deep Q-Network (DQN), PyTorch implementation.

    Fix over the original: ``self._qf``, ``self._steps_per_epoch`` and
    ``self._n_train_steps`` were each assigned twice with identical values;
    the duplicate assignments are removed.

    Args:
        env_spec: Environment specification (provides max episode length).
        policy: Greedy/argmax policy over the Q-function.
        qf: Q-function; deep-copied into a target network.
        replay_buffer: Experience replay buffer.
        sampler: Sampler used to collect episodes.
        exploration_policy: Optional exploration wrapper policy.
        eval_env: Optional dedicated evaluation environment.
        double_q (bool): Use Double-DQN targets.
        qf_optimizer: Optimizer class for the Q-function.
        steps_per_epoch (int): Sampler iterations per epoch.
        n_train_steps (int): Gradient steps per sampler iteration.
        max_episode_length_eval (int): Evaluation episode cap; falls back
            to the training max episode length when None.
        deterministic_eval (bool): Evaluate deterministically.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        num_eval_episodes (int): Episodes per evaluation.
        discount (float): Reward discount.
        qf_lr: Q-function learning rate.
        clip_rewards (float): Optional reward clipping bound.
        clip_gradient (float): Gradient-norm clip.
        target_update_freq (int): Target-sync period.
        reward_scale (float): Reward scaling factor.
    """
    self._clip_reward = clip_rewards
    self._clip_grad = clip_gradient
    self._steps_per_epoch = steps_per_epoch
    self._target_update_freq = target_update_freq

    # Per-epoch logging accumulators.
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    self._policy = policy
    self._qf = qf
    self._n_train_steps = n_train_steps
    self._min_buffer_size = min_buffer_size
    # (Duplicate re-assignments of _qf/_steps_per_epoch/_n_train_steps
    # from the original were removed here; values are unchanged.)
    self._buffer_batch_size = buffer_batch_size
    self._double_q = double_q
    self._discount = discount
    self._reward_scale = reward_scale

    self.max_episode_length = env_spec.max_episode_length
    self._max_episode_length_eval = (max_episode_length_eval
                                     or self.max_episode_length)

    self._episode_reward_mean = collections.deque(maxlen=100)
    self._num_eval_episodes = num_eval_episodes
    self._deterministic_eval = deterministic_eval

    self.env_spec = env_spec
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy

    # Target network starts as an exact copy of the online Q-function.
    self._target_qf = copy.deepcopy(self._qf)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self._qf,
                                        lr=qf_lr)
    self._eval_env = eval_env
    self._sampler = sampler
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        sampler,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        max_episode_length_eval=None,
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=tf.compat.v1.train.AdamOptimizer,
        qf_optimizer=tf.compat.v1.train.AdamOptimizer,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        name='DDPG'):
    """Deep Deterministic Policy Gradient (DDPG), TensorFlow implementation.

    Args:
        env_spec: Environment specification (action bounds, episode length).
        policy: Policy to train; cloned into a target policy.
        qf: Q-function; cloned into a target.
        replay_buffer: Experience replay buffer.
        sampler: Sampler used to collect episodes.
        steps_per_epoch (int): Sampler iterations per epoch.
        n_train_steps (int): Gradient steps per sampler iteration.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        max_episode_length_eval (int): Evaluation episode cap; falls back
            to the env spec's max episode length when None.
        exploration_policy: Optional exploration wrapper policy.
        target_update_tau (float): Soft target-update coefficient.
        discount (float): Reward discount.
        policy_weight_decay (float): Policy L2 weight decay.
        qf_weight_decay (float): Q-function L2 weight decay.
        policy_optimizer: TF optimizer class for the policy.
        qf_optimizer: TF optimizer class for the Q-function.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        max_action (float): Action bound; defaults to the env's bound.
        reward_scale (float): Reward scaling factor.
        name (str): Name of this algorithm.
    """
    action_bound = env_spec.action_space.high
    self._max_action = action_bound if max_action is None else max_action
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._name = name
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return

    # Per-epoch logging accumulators.
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    # Target networks for the actor and critic.
    self._target_policy = policy.clone('target_policy')
    self._target_qf = qf.clone('target_qf')

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer
    self._policy_lr = policy_lr
    self._qf_lr = qf_lr

    self._min_buffer_size = min_buffer_size
    self._qf = qf
    self._steps_per_epoch = steps_per_epoch
    self._n_train_steps = n_train_steps
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale

    self.max_episode_length = env_spec.max_episode_length
    # Evaluation length defaults to the training length unless overridden.
    self._max_episode_length_eval = (max_episode_length_eval
                                     if max_episode_length_eval is not None
                                     else env_spec.max_episode_length)

    self._eval_env = None
    self._env_spec = env_spec
    self._replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy
    self._sampler = sampler

    self._init_opt()
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        max_path_length,
        steps_per_epoch=20,
        n_train_steps=50,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True):
    """Deep Deterministic Policy Gradient (DDPG), PyTorch implementation.

    Fix over the original: ``self._qf``, ``self._steps_per_epoch`` and
    ``self._n_train_steps`` were each assigned twice with identical values;
    the duplicate assignments are removed.

    Args:
        env_spec: Environment specification (action bounds).
        policy: Policy to train; deep-copied into a target policy.
        qf: Q-function; deep-copied into a target.
        replay_buffer: Experience replay buffer.
        max_path_length (int): Maximum rollout length.
        steps_per_epoch (int): Sampler iterations per epoch.
        n_train_steps (int): Gradient steps per sampler iteration.
        max_eval_path_length (int): Maximum evaluation rollout length.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        exploration_policy: Optional exploration wrapper policy.
        target_update_tau (float): Soft target-update coefficient.
        discount (float): Reward discount.
        policy_weight_decay (float): Policy L2 weight decay.
        qf_weight_decay (float): Q-function L2 weight decay.
        policy_optimizer: Optimizer class for the policy.
        qf_optimizer: Optimizer class for the Q-function.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        max_action (float): Action bound; defaults to the env's bound.
        reward_scale (float): Reward scaling factor.
        smooth_return (bool): Smooth returns when logging.
    """
    action_bound = env_spec.action_space.high
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    self._max_action = action_bound if max_action is None else max_action
    self._steps_per_epoch = steps_per_epoch

    # Per-epoch logging accumulators.
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    self._policy = policy
    self._qf = qf
    self._n_train_steps = n_train_steps
    self._min_buffer_size = min_buffer_size
    # (Duplicate re-assignments of _qf/_steps_per_epoch/_n_train_steps
    # from the original were removed here; values are unchanged.)
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale
    self._smooth_return = smooth_return
    self.max_path_length = max_path_length
    self._max_eval_path_length = max_eval_path_length

    self.env_spec = env_spec
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy

    # Target networks start as exact copies of their online counterparts.
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self._qf)

    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self._qf,
                                        lr=qf_lr)
    self._eval_env = None
    self.sampler_cls = LocalSampler
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_path_length=None,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        rollout_batch_size=1,
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=tf.compat.v1.train.AdamOptimizer,
        qf_optimizer=tf.compat.v1.train.AdamOptimizer,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True,
        name='DDPG'):
    """DDPG (TF), off-policy-vectorized-sampler variant.

    Args:
        env_spec: Environment specification (action bounds).
        policy: Policy to train; cloned into a target policy.
        qf: Q-function; cloned into a target.
        replay_buffer: Experience replay buffer.
        steps_per_epoch (int): Sampler iterations per epoch.
        n_train_steps (int): Gradient steps per sampler iteration.
        max_path_length (int): Maximum rollout length.
        max_eval_path_length (int): Maximum evaluation rollout length.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        rollout_batch_size (int): Rollouts per sampler step.
        exploration_policy: Optional exploration wrapper policy.
        target_update_tau (float): Soft target-update coefficient.
        discount (float): Reward discount.
        policy_weight_decay (float): Policy L2 weight decay.
        qf_weight_decay (float): Q-function L2 weight decay.
        policy_optimizer: TF optimizer class for the policy.
        qf_optimizer: TF optimizer class for the Q-function.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        max_action (float): Action bound; defaults to the env's bound.
        reward_scale (float): Reward scaling factor.
        smooth_return (bool): Smooth returns when logging.
        name (str): Name of this algorithm.
    """
    action_bound = env_spec.action_space.high
    self._max_action = action_bound if max_action is None else max_action
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._name = name
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return

    # Logging accumulators.
    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    # Target networks for the actor and critic.
    self._target_policy = policy.clone('target_policy')
    self._target_qf = qf.clone('target_qf')

    self._policy_optimizer = policy_optimizer
    self._qf_optimizer = qf_optimizer
    self._policy_lr = policy_lr
    self._qf_lr = qf_lr

    self._min_buffer_size = min_buffer_size
    self._qf = qf
    self._steps_per_epoch = steps_per_epoch
    self._n_train_steps = n_train_steps
    self._buffer_batch_size = buffer_batch_size
    self._discount = discount
    self._reward_scale = reward_scale
    self._smooth_return = smooth_return
    self.max_path_length = max_path_length
    self._max_eval_path_length = max_eval_path_length

    # Public fields consumed by OffPolicyVectorizedSampler.
    self.rollout_batch_size = rollout_batch_size
    self.env_spec = env_spec
    self.replay_buffer = replay_buffer
    self.policy = policy
    self.exploration_policy = exploration_policy
    self.sampler_cls = OffPolicyVectorizedSampler

    self.init_opt()
def __init__(
        self,
        env_spec,
        policy,
        qf,
        replay_buffer,
        *,  # Everything after this is numbers.
        steps_per_epoch=20,
        n_train_steps=50,
        max_path_length=None,
        max_eval_path_length=None,
        buffer_batch_size=64,
        min_buffer_size=int(1e4),
        rollout_batch_size=1,
        exploration_policy=None,
        target_update_tau=0.01,
        discount=0.99,
        policy_weight_decay=0,
        qf_weight_decay=0,
        policy_optimizer=torch.optim.Adam,
        qf_optimizer=torch.optim.Adam,
        policy_lr=_Default(1e-4),
        qf_lr=_Default(1e-3),
        clip_pos_returns=False,
        clip_return=np.inf,
        max_action=None,
        reward_scale=1.,
        smooth_return=True):
    """DDPG (PyTorch) delegating shared setup to the off-policy base.

    DDPG-specific state (target networks, soft-update coefficient, action
    bounds) is stored here; everything common to off-policy algorithms is
    forwarded to the base-class constructor.

    Args:
        env_spec: Environment specification (action bounds).
        policy: Policy to train; deep-copied into a target policy.
        qf: Q-function; deep-copied into a target.
        replay_buffer: Experience replay buffer.
        steps_per_epoch (int): Sampler iterations per epoch.
        n_train_steps (int): Gradient steps per sampler iteration.
        max_path_length (int): Maximum rollout length.
        max_eval_path_length (int): Maximum evaluation rollout length.
        buffer_batch_size (int): Minibatch size sampled from the buffer.
        min_buffer_size (int): Transitions required before training.
        rollout_batch_size (int): Rollouts per sampler step.
        exploration_policy: Optional exploration wrapper policy.
        target_update_tau (float): Soft target-update coefficient.
        discount (float): Reward discount.
        policy_weight_decay (float): Policy L2 weight decay.
        qf_weight_decay (float): Q-function L2 weight decay.
        policy_optimizer: Optimizer class for the policy.
        qf_optimizer: Optimizer class for the Q-function.
        policy_lr: Policy learning rate.
        qf_lr: Q-function learning rate.
        clip_pos_returns (bool): Clip positive returns.
        clip_return (float): Return clipping bound.
        max_action (float): Action bound; defaults to the env's bound.
        reward_scale (float): Reward scaling factor.
        smooth_return (bool): Smooth returns when logging.
    """
    action_bound = env_spec.action_space.high
    self._tau = target_update_tau
    self._policy_weight_decay = policy_weight_decay
    self._qf_weight_decay = qf_weight_decay
    self._clip_pos_returns = clip_pos_returns
    self._clip_return = clip_return
    self._max_action = action_bound if max_action is None else max_action

    # Logging accumulators.
    self._success_history = deque(maxlen=100)
    self._episode_rewards = []
    self._episode_policy_losses = []
    self._episode_qf_losses = []
    self._epoch_ys = []
    self._epoch_qs = []

    super().__init__(env_spec=env_spec,
                     policy=policy,
                     qf=qf,
                     n_train_steps=n_train_steps,
                     steps_per_epoch=steps_per_epoch,
                     max_path_length=max_path_length,
                     max_eval_path_length=max_eval_path_length,
                     buffer_batch_size=buffer_batch_size,
                     min_buffer_size=min_buffer_size,
                     rollout_batch_size=rollout_batch_size,
                     exploration_policy=exploration_policy,
                     replay_buffer=replay_buffer,
                     use_target=True,
                     discount=discount,
                     reward_scale=reward_scale,
                     smooth_return=smooth_return)

    # Target networks start as exact copies; the base class has already
    # published self.policy and self.qf.
    self._target_policy = copy.deepcopy(self.policy)
    self._target_qf = copy.deepcopy(self.qf)

    self._policy_optimizer = make_optimizer(policy_optimizer,
                                            module=self.policy,
                                            lr=policy_lr)
    self._qf_optimizer = make_optimizer(qf_optimizer,
                                        module=self.qf,
                                        lr=qf_lr)