def test_get_optimizer(self):
    """Tests get_optimizer_fn.
    """
    default_optimizer_fn, optimizer_class = opt.get_optimizer_fn(
        opt.default_optimization_hparams()["optimizer"])
    default_optimizer = default_optimizer_fn(1.0)
    self.assertTrue(issubclass(optimizer_class, tf.train.Optimizer))
    self.assertIsInstance(default_optimizer, tf.train.AdamOptimizer)

    hparams = {
        "type": "MomentumOptimizer",
        "kwargs": {
            "learning_rate": 0.001,
            "momentum": 0.9,
            "use_nesterov": True
        }
    }
    momentum_optimizer_fn, _ = opt.get_optimizer_fn(hparams)
    momentum_optimizer = momentum_optimizer_fn()
    self.assertIsInstance(momentum_optimizer, tf.train.MomentumOptimizer)

    hparams = {
        "type": tf.train.MomentumOptimizer,
        "kwargs": {
            "momentum": 0.9,
            "use_nesterov": True
        }
    }
    momentum_optimizer_fn, _ = opt.get_optimizer_fn(hparams)
    momentum_optimizer = momentum_optimizer_fn(0.001)
    self.assertIsInstance(momentum_optimizer, tf.train.MomentumOptimizer)

    hparams = {"type": tf.train.MomentumOptimizer(0.001, 0.9)}
    momentum_optimizer, _ = opt.get_optimizer_fn(hparams)
    self.assertIsInstance(momentum_optimizer, tf.train.MomentumOptimizer)
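# A minimal usage sketch (not part of the test suite): the same "optimizer"
# hparams style plugs into a full optimization spec. This assumes
# `opt.get_train_op` accepts a loss tensor plus the optimization hparams
# dict, mirroring `opt.default_optimization_hparams()`.
def _example_train_op(loss):
    hparams = opt.default_optimization_hparams()
    hparams["optimizer"] = {
        "type": "MomentumOptimizer",
        "kwargs": {"learning_rate": 0.001, "momentum": 0.9},
    }
    # Builds an op that computes gradients of `loss` and applies them with
    # the configured momentum optimizer.
    return opt.get_train_op(loss, hparams=hparams)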
def default_hparams():
    """Returns a dictionary of hyperparameters with default values:

    .. role:: python(code)
       :language: python

    .. code-block:: python

        {
            'policy_type': 'CategoricalPolicyNet',
            'policy_hparams': None,
            'discount_factor': 0.95,
            'normalize_reward': False,
            'optimization': default_optimization_hparams(),
            'name': 'pg_agent',
        }

    Here:

    "policy_type": str or class or instance
        Policy net. Can be class, its name or module path, or a class
        instance. If class name is given, the class must be from module
        :mod:`texar.tf.modules` or :mod:`texar.tf.custom`. Ignored if a
        `policy` is given to the agent constructor.

    "policy_hparams": dict, optional
        Hyperparameters for the policy net. With the :attr:`policy_kwargs`
        argument to the constructor, a network is created with
        :python:`policy_class(**policy_kwargs, hparams=policy_hparams)`.

    "discount_factor": float
        The discount factor of reward.

    "normalize_reward": bool
        Whether to normalize the discounted reward, by
        `(discounted_reward - mean) / std`.

    "optimization": dict
        Hyperparameters of optimization for updating the policy net.
        See :func:`~texar.tf.core.default_optimization_hparams` for
        details.

    "name": str
        Name of the agent.
    """
    return {
        'policy_type': 'CategoricalPolicyNet',
        'policy_hparams': None,
        'discount_factor': 0.95,
        'normalize_reward': False,
        'optimization': opt.default_optimization_hparams(),
        'name': 'pg_agent',
    }
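# A usage sketch: take the defaults and selectively override entries; the
# nested structure is preserved. Passing the result to the agent constructor
# as `hparams` is assumed to follow the usual texar convention.
example_pg_hparams = default_hparams()
example_pg_hparams.update({
    "discount_factor": 0.99,   # weigh distant rewards more heavily
    "normalize_reward": True,  # standardize discounted returns
})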
def test_get_learning_rate_decay_fn(self):  # pylint: disable=too-many-locals
    """Tests get_learning_rate_decay_fn.
    """
    default_lr_decay_fn = opt.get_learning_rate_decay_fn(
        opt.default_optimization_hparams()["learning_rate_decay"])
    self.assertIsNone(default_lr_decay_fn)

    boundaries = [2, 4]
    values = [0.1, 0.01, 0.001]
    hparams = {
        "type": "piecewise_constant",
        "kwargs": {
            "boundaries": boundaries,
            "values": values
        },
        "min_learning_rate": 0.05,
        "start_decay_step": 1,
        "end_decay_step": utils.MAX_SEQ_LENGTH,
    }
    pc_lr_decay_fn = opt.get_learning_rate_decay_fn(hparams)
    global_step = 1
    pc_lr = pc_lr_decay_fn(learning_rate=1., global_step=global_step)
    pc_lr_true = tf.train.piecewise_constant(
        global_step - hparams["start_decay_step"], boundaries, values)

    hparams["type"] = "natural_exp_decay"
    hparams["kwargs"] = {
        "decay_steps": 1,
        "decay_rate": 0.5
    }
    ned_lr_decay_fn = opt.get_learning_rate_decay_fn(hparams)
    ned_lr = ned_lr_decay_fn(learning_rate=1., global_step=global_step)
    ned_lr_true = tf.train.natural_exp_decay(
        1., global_step - hparams["start_decay_step"],
        hparams["kwargs"]["decay_steps"], hparams["kwargs"]["decay_rate"])

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        pc_lr_, pc_lr_true_, ned_lr_, ned_lr_true_ = sess.run(
            [pc_lr, pc_lr_true, ned_lr, ned_lr_true])
        self.assertEqual(pc_lr_, pc_lr_true_)
        self.assertEqual(ned_lr_, ned_lr_true_)
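# A hedged sketch of a further decay spec (assuming any `tf.train` decay
# function can be named in "type", as with the two exercised above):
# staircase exponential decay, floored at `min_learning_rate` once decay
# has started.
def _example_exp_decayed_lr(global_step):
    hparams = {
        "type": "exponential_decay",
        "kwargs": {"decay_steps": 1000, "decay_rate": 0.96,
                   "staircase": True},
        "min_learning_rate": 1e-5,
        "start_decay_step": 0,
        "end_decay_step": utils.MAX_SEQ_LENGTH,
    }
    lr_decay_fn = opt.get_learning_rate_decay_fn(hparams)
    # Maps (initial learning rate, step) to the decayed learning rate.
    return lr_decay_fn(learning_rate=0.001, global_step=global_step)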
def test_get_gradient_clip_fn(self):  # pylint: disable=too-many-locals
    """Tests get_gradient_clip_fn.
    """
    default_grad_clip_fn = opt.get_gradient_clip_fn(
        opt.default_optimization_hparams()["gradient_clip"])
    self.assertIsNone(default_grad_clip_fn)

    grads = [tf.random_uniform([10, 10], -1., 1.) for _ in range(5)]
    grads_and_vars = list(zip(grads, range(5)))

    hparams = {
        "type": "clip_by_global_norm",
        "kwargs": {
            "clip_norm": 0.1
        }
    }
    gn_grad_clip_fn = opt.get_gradient_clip_fn(hparams)
    gn_grads_and_vars = gn_grad_clip_fn(grads_and_vars)
    gn_grads, _ = zip(*gn_grads_and_vars)
    gn_grads_true, _ = tf.clip_by_global_norm(
        grads, hparams["kwargs"]["clip_norm"])

    hparams = {
        "type": "clip_by_value",
        "kwargs": {
            "clip_value_min": -0.01,
            "clip_value_max": 0.01
        }
    }
    v_grad_clip_fn = opt.get_gradient_clip_fn(hparams)
    v_grads_and_vars = v_grad_clip_fn(grads_and_vars)
    v_grads, _ = zip(*v_grads_and_vars)
    v_grads_true = tf.clip_by_value(grads,
                                    hparams["kwargs"]["clip_value_min"],
                                    hparams["kwargs"]["clip_value_max"])

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        gn_grads_, gn_grads_true_, v_grads_, v_grads_true_ = sess.run(
            [gn_grads, gn_grads_true, v_grads, v_grads_true])
        np.testing.assert_array_equal(gn_grads_, gn_grads_true_)
        np.testing.assert_array_equal(v_grads_, v_grads_true_)
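# A minimal sketch of where the clip fn sits in a hand-rolled optimize step
# (wiring assumed, mirroring the test above): compute gradients, clip them,
# then apply. Uses only standard TF1 optimizer methods.
def _example_clipped_minimize(loss, optimizer):
    clip_fn = opt.get_gradient_clip_fn(
        {"type": "clip_by_global_norm", "kwargs": {"clip_norm": 5.0}})
    grads_and_vars = optimizer.compute_gradients(loss)
    # Rescale all gradients jointly so their global norm is at most 5.0.
    clipped_grads_and_vars = clip_fn(grads_and_vars)
    return optimizer.apply_gradients(clipped_grads_and_vars)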
def default_hparams():
    """Returns a dictionary of hyperparameters with default values:

    .. role:: python(code)
       :language: python

    .. code-block:: python

        {
            'qnet_type': 'CategoricalQNet',
            'qnet_hparams': None,
            'replay_memory_type': 'DequeReplayMemory',
            'replay_memory_hparams': None,
            'exploration_type': 'EpsilonLinearDecayExploration',
            'exploration_hparams': None,
            'optimization': opt.default_optimization_hparams(),
            'target_update_strategy': 'copy',
            'cold_start_steps': 100,
            'sample_batch_size': 32,
            'update_period': 100,
            'discount_factor': 0.95,
            'name': 'dqn_agent'
        }

    Here:

    "qnet_type": str or class or instance
        Q-value net. Can be class, its name or module path, or a class
        instance. If class name is given, the class must be from module
        :mod:`texar.tf.modules` or :mod:`texar.tf.custom`. Ignored if a
        `qnet` is given to the agent constructor.

    "qnet_hparams": dict, optional
        Hyperparameters for the Q net. With the :attr:`qnet_kwargs`
        argument to the constructor, a network is created with
        :python:`qnet_class(**qnet_kwargs, hparams=qnet_hparams)`.

    "replay_memory_type": str or class or instance
        Replay memory class. Can be class, its name or module path, or a
        class instance. If class name is given, the class must be from
        module :mod:`texar.tf.core` or :mod:`texar.tf.custom`. Ignored if
        a `replay_memory` is given to the agent constructor.

    "replay_memory_hparams": dict, optional
        Hyperparameters for the replay memory. With the
        :attr:`replay_memory_kwargs` argument to the constructor, a
        replay memory is created with :python:`replay_memory_class(
        **replay_memory_kwargs, hparams=replay_memory_hparams)`.

    "exploration_type": str or class or instance
        Exploration class. Can be class, its name or module path, or a
        class instance. If class name is given, the class must be from
        module :mod:`texar.tf.core` or :mod:`texar.tf.custom`. Ignored if
        an `exploration` is given to the agent constructor.

    "exploration_hparams": dict, optional
        Hyperparameters for the exploration class. With the
        :attr:`exploration_kwargs` argument to the constructor, an
        exploration instance is created with :python:`exploration_class(
        **exploration_kwargs, hparams=exploration_hparams)`.

    "optimization": dict
        Hyperparameters of optimization for updating the Q-net.
        See :func:`~texar.tf.core.default_optimization_hparams` for
        details.

    "cold_start_steps": int
        Number of initial steps during which the Q-net is not trained;
        only experience is collected.

    "sample_batch_size": int
        The number of samples taken from the replay memory for each
        training step.

    "target_update_strategy": str

        - If **"copy"**, the target network is assigned the parameters
          of the Q-net every :attr:`"update_period"` steps.
        - If **"tau"**, the target is updated by assigning
          :python:`(1 - 1/update_period) * target + 1/update_period * qnet`.

    "update_period": int
        Frequency of updating the target network, i.e., the target is
        updated once for every :attr:`"update_period"` steps.

    "discount_factor": float
        The discount factor of reward.

    "name": str
        Name of the agent.
    """
    return {
        'qnet_type': 'CategoricalQNet',
        'qnet_hparams': None,
        'replay_memory_type': 'DequeReplayMemory',
        'replay_memory_hparams': None,
        'exploration_type': 'EpsilonLinearDecayExploration',
        'exploration_hparams': None,
        'optimization': opt.default_optimization_hparams(),
        'target_update_strategy': 'copy',
        'cold_start_steps': 100,
        'sample_batch_size': 32,
        'update_period': 100,
        'discount_factor': 0.95,
        'name': 'dqn_agent'
    }
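# A sketch of the "tau" strategy described above: with update_period=100 the
# target network is softly updated rather than copied every 100 steps, i.e.
# target <- 0.99 * target + 0.01 * qnet.
example_dqn_hparams = default_hparams()
example_dqn_hparams.update({
    "target_update_strategy": "tau",
    "update_period": 100,
})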
def default_hparams():
    """Returns a dictionary of hyperparameters with default values:

    .. role:: python(code)
       :language: python

    .. code-block:: python

        {
            'discount_factor': 0.95,
            'normalize_reward': False,
            'entropy_weight': 0.,
            'loss': {
                'average_across_batch': True,
                'average_across_timesteps': False,
                'sum_over_batch': False,
                'sum_over_timesteps': True,
                'time_major': False
            },
            'optimization': default_optimization_hparams(),
            'name': 'pg_agent',
        }

    Here:

    "discount_factor": float
        The discount factor of reward.

    "normalize_reward": bool
        Whether to normalize the discounted reward, by
        `(discounted_reward - mean) / std`. Here `mean` and `std` are
        over all time steps and all samples in the batch.

    "entropy_weight": float
        The weight of the entropy loss of the sample distribution, to
        encourage maximizing the Shannon entropy. Set to 0 to disable
        the loss.

    "loss": dict
        Extra keyword arguments for
        :func:`~texar.tf.losses.pg_loss_with_logits`, including the
        reduce arguments (e.g., `average_across_batch`) and `time_major`.

    "optimization": dict
        Hyperparameters of optimization for updating the policy net.
        See :func:`~texar.tf.core.default_optimization_hparams` for
        details.

    "name": str
        Name of the agent.
    """
    return {
        'discount_factor': 0.95,
        'normalize_reward': False,
        'entropy_weight': 0.,
        'loss': {
            'average_across_batch': True,
            'average_across_timesteps': False,
            'sum_over_batch': False,
            'sum_over_timesteps': True,
            'time_major': False
        },
        'optimization': opt.default_optimization_hparams(),
        'name': 'pg_agent',
    }
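# A sketch of reconfiguring the loss reduction: average over timesteps
# instead of summing, which keeps the loss scale independent of sequence
# length. `default_hparams()` returns a fresh dict, so in-place updates are
# safe.
example_seq_pg_hparams = default_hparams()
example_seq_pg_hparams["loss"].update({
    "average_across_timesteps": True,
    "sum_over_timesteps": False,
})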