def test_ddpg_compilation(self):
    """Test whether DDPG can be built with both frameworks."""
    config = ddpg.DDPGConfig()
    config.num_workers = 0
    config.num_envs_per_worker = 2
    config.replay_buffer_config["learning_starts"] = 0
    # Copy the exploration config before updating it (dict.update returns
    # None, so it must not be passed directly to `config.exploration()`).
    explore = config.exploration_config.copy()
    explore.update({"random_timesteps": 100})
    config.exploration(exploration_config=explore)

    num_iterations = 1

    # Test against all frameworks.
    for _ in framework_iterator(config, with_eager_tracing=True):
        algo = config.build(env="Pendulum-v1")
        for i in range(num_iterations):
            results = algo.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(algo)
        # Ensure apply_gradient_fn is being called and updating global_step.
        pol = algo.get_policy()
        if config.framework_str == "tf":
            a = pol.get_session().run(pol.global_step)
        else:
            a = pol.global_step
        check(a, 500)
        algo.stop()
def test_ddpg_exploration_and_with_random_prerun(self):
    """Tests DDPG's Exploration (w/ random actions for n timesteps)."""
    core_config = ddpg.DDPGConfig().rollouts(num_rollout_workers=0)
    obs = np.array([0.0, 0.1, -0.1])

    # Test against all frameworks.
    for _ in framework_iterator(core_config):
        config = copy.deepcopy(core_config)

        # Default OUNoise setup.
        algo = config.build(env="Pendulum-v1")
        # Setting explore=False should always return the same action.
        a_ = algo.compute_single_action(obs, explore=False)
        check(algo.get_policy().global_timestep, 1)
        for i in range(50):
            a = algo.compute_single_action(obs, explore=False)
            check(algo.get_policy().global_timestep, i + 2)
            check(a, a_)
        # explore=None (default: explore) should return different actions.
        actions = []
        for i in range(50):
            actions.append(algo.compute_single_action(obs))
            check(algo.get_policy().global_timestep, i + 52)
        check(np.std(actions), 0.0, false=True)
        algo.stop()

        # Check randomness at beginning.
        config.exploration_config.update(
            {
                # Act randomly at beginning ...
                "random_timesteps": 50,
                # Then act very closely to deterministic actions thereafter.
                "ou_base_scale": 0.001,
                "initial_scale": 0.001,
                "final_scale": 0.001,
            }
        )
        algo = ddpg.DDPG(config=config, env="Pendulum-v1")
        # ts=0 (get a deterministic action as per explore=False).
        deterministic_action = algo.compute_single_action(obs, explore=False)
        check(algo.get_policy().global_timestep, 1)
        # ts=1-49 (in random window).
        random_a = []
        for i in range(1, 50):
            random_a.append(algo.compute_single_action(obs, explore=True))
            check(algo.get_policy().global_timestep, i + 1)
            check(random_a[-1], deterministic_action, false=True)
        self.assertTrue(np.std(random_a) > 0.5)

        # ts > 50 (a=deterministic_action + scale * N[0,1]).
        for i in range(50):
            a = algo.compute_single_action(obs, explore=True)
            check(algo.get_policy().global_timestep, i + 51)
            check(a, deterministic_action, rtol=0.1)

        # ts >> 50 (BUT: explore=False -> expect deterministic action).
        for i in range(50):
            a = algo.compute_single_action(obs, explore=False)
            check(algo.get_policy().global_timestep, i + 101)
            check(a, deterministic_action)
        algo.stop()
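# For reference: a minimal, self-contained sketch of an Ornstein-Uhlenbeck
# noise process of the kind the exploration_config keys above
# ("ou_base_scale", "initial_scale", "final_scale") scale. Illustrative
# assumption only -- this is not RLlib's actual OrnsteinUhlenbeckNoise
# implementation, just the standard OU recurrence it is based on.
def _ou_noise_sketch(action_dim=1, steps=50, theta=0.15, sigma=0.2, scale=1.0):
    """Return `steps` OU samples: x += theta * (0 - x) + sigma * N(0, 1)."""
    x = np.zeros(action_dim)
    samples = []
    for _ in range(steps):
        # Mean-reverting step toward 0 plus Gaussian perturbation.
        x = x + theta * (0.0 - x) + sigma * np.random.randn(action_dim)
        samples.append(scale * x)
    return np.stack(samples)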
def test_ddpg_loss_function(self):
    """Tests DDPG loss function results across all frameworks."""
    config = ddpg.DDPGConfig()
    # Run locally.
    config.seed = 42
    config.num_workers = 0
    config.twin_q = True
    config.use_huber = True
    config.huber_threshold = 1.0
    config.gamma = 0.99
    # Make this small (seems to introduce errors).
    config.l2_reg = 1e-10
    config.replay_buffer_config = {
        "type": "MultiAgentReplayBuffer",
        "capacity": 50000,
        "learning_starts": 0,
    }
    # Use very simple nets.
    config.actor_hiddens = [10]
    config.critic_hiddens = [10]
    # Make sure timing differences do not affect Algorithm.train().
    config.min_time_s_per_iteration = 0
    config.min_sample_timesteps_per_iteration = 100

    # Map tf variable names to their torch state-dict counterparts.
    map_ = {
        # Normal net.
        "default_policy/actor_hidden_0/kernel": "policy_model.action_0._model.0.weight",
        "default_policy/actor_hidden_0/bias": "policy_model.action_0._model.0.bias",
        "default_policy/actor_out/kernel": "policy_model.action_out._model.0.weight",
        "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias",
        "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
        "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
        "default_policy/sequential/q_out/kernel": "q_model.q_out._model.0.weight",
        "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias",
        # -- twin.
        "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
        "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
        "default_policy/sequential_1/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
        "default_policy/sequential_1/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
        # Target net.
        "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0._model.0.weight",
        "default_policy/actor_hidden_0_1/bias": "policy_model.action_0._model.0.bias",
        "default_policy/actor_out_1/kernel": "policy_model.action_out._model.0.weight",
        "default_policy/actor_out_1/bias": "policy_model.action_out._model.0.bias",
        "default_policy/sequential_2/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
        "default_policy/sequential_2/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
        "default_policy/sequential_2/q_out/kernel": "q_model.q_out._model.0.weight",
        "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias",
        # -- twin.
        "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
        "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
        "default_policy/sequential_3/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
        "default_policy/sequential_3/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
    }

    env = SimpleEnv
    batch_size = 100
    obs_size = (batch_size, 1)
    actions = np.random.random(size=(batch_size, 1))

    # Batch of size=n.
    input_ = self._get_batch_helper(obs_size, actions, batch_size)

    # Simply compare loss values AND grads of all frameworks with each
    # other.
    prev_fw_loss = weights_dict = None
    expect_c, expect_a, expect_t = None, None, None
    # History of tf-updated NN-weights over n training steps.
    tf_updated_weights = []
    # History of input batches used.
    tf_inputs = []

    for fw, sess in framework_iterator(
        config, frameworks=("tf", "torch"), session=True
    ):
        # Generate Algorithm and get its default Policy object.
        algo = config.build(env=env)
        policy = algo.get_policy()
        p_sess = None
        if sess:
            p_sess = policy.get_session()

        # Set all weights (of all nets) to fixed values.
        if weights_dict is None:
            assert fw == "tf"  # Start with the tf vars-dict.
            weights_dict = policy.get_weights()
        else:
            assert fw == "torch"  # Then transfer that to torch Model.
            model_dict = self._translate_weights_to_torch(weights_dict, map_)
            policy.model.load_state_dict(model_dict)
            policy.target_model.load_state_dict(model_dict)

        if fw == "torch":
            # Actually convert to torch tensors.
            input_ = policy._lazy_tensor_dict(input_)
            input_ = {k: input_[k] for k in input_.keys()}

        # Only run the expectation once, should be the same anyways
        # for all frameworks.
        if expect_c is None:
            expect_c, expect_a, expect_t = self._ddpg_loss_helper(
                input_,
                weights_dict,
                sorted(weights_dict.keys()),
                fw,
                gamma=config.gamma,
                huber_threshold=config.huber_threshold,
                l2_reg=config.l2_reg,
                sess=sess,
            )

        # Get actual outs and compare to expectation AND previous
        # framework. c=critic, a=actor, t=td-error.
        if fw == "tf":
            c, a, t, tf_c_grads, tf_a_grads = p_sess.run(
                [
                    policy.critic_loss,
                    policy.actor_loss,
                    policy.td_error,
                    policy._critic_optimizer.compute_gradients(
                        policy.critic_loss, policy.model.q_variables()
                    ),
                    policy._actor_optimizer.compute_gradients(
                        policy.actor_loss, policy.model.policy_variables()
                    ),
                ],
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False),
            )
            # Check pure loss values.
            check(c, expect_c)
            check(a, expect_a)
            check(t, expect_t)

            tf_c_grads = [g for g, v in tf_c_grads]
            tf_a_grads = [g for g, v in tf_a_grads]

        elif fw == "torch":
            policy.loss(policy.model, None, input_)
            c, a, t = (
                policy.get_tower_stats("critic_loss")[0],
                policy.get_tower_stats("actor_loss")[0],
                policy.get_tower_stats("td_error")[0],
            )
            # Check pure loss values.
            check(c, expect_c)
            check(a, expect_a)
            check(t, expect_t)

            # Test actor gradients.
            policy._actor_optimizer.zero_grad()
            assert all(v.grad is None for v in policy.model.q_variables())
            assert all(v.grad is None for v in policy.model.policy_variables())
            a.backward()
            # `actor_loss` depends on Q-net vars
            # (but not twin-Q-net vars!).
            assert not any(v.grad is None for v in policy.model.q_variables()[:4])
            assert all(v.grad is None for v in policy.model.q_variables()[4:])
            assert not all(
                torch.mean(v.grad) == 0 for v in policy.model.policy_variables()
            )
            assert not all(
                torch.min(v.grad) == 0 for v in policy.model.policy_variables()
            )
            # Compare with tf ones.
            torch_a_grads = [v.grad for v in policy.model.policy_variables()]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.cpu()))
                else:
                    check(tf_g, torch_g)

            # Test critic gradients.
            policy._critic_optimizer.zero_grad()
            assert all(
                v.grad is None or torch.mean(v.grad) == 0.0
                for v in policy.model.q_variables()
            )
            assert all(
                v.grad is None or torch.min(v.grad) == 0.0
                for v in policy.model.q_variables()
            )
            c.backward()
            assert not all(
                torch.mean(v.grad) == 0 for v in policy.model.q_variables()
            )
            assert not all(
                torch.min(v.grad) == 0 for v in policy.model.q_variables()
            )
            # Compare with tf ones.
            torch_c_grads = [v.grad for v in policy.model.q_variables()]
            for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.cpu()))
                else:
                    check(tf_g, torch_g)
            # Compare (unchanged(!) actor grads) with tf ones.
            torch_a_grads = [v.grad for v in policy.model.policy_variables()]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.cpu()))
                else:
                    check(tf_g, torch_g)

        # Store this framework's losses in prev_fw_loss to compare with
        # next framework's outputs.
        if prev_fw_loss is not None:
            check(c, prev_fw_loss[0])
            check(a, prev_fw_loss[1])
            check(t, prev_fw_loss[2])

        prev_fw_loss = (c, a, t)

        # Update weights from our batch (n times).
        for update_iteration in range(6):
            print("train iteration {}".format(update_iteration))
            if fw == "tf":
                in_ = self._get_batch_helper(obs_size, actions, batch_size)
                tf_inputs.append(in_)
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = algo.local_replay_buffer
                patch_buffer_with_fake_sampling_method(buf, in_)
                algo.train()
                updated_weights = policy.get_weights()
                # Net must have changed.
                if tf_updated_weights:
                    check(
                        updated_weights["default_policy/actor_hidden_0/kernel"],
                        tf_updated_weights[-1][
                            "default_policy/actor_hidden_0/kernel"
                        ],
                        false=True,
                    )
                tf_updated_weights.append(updated_weights)

            # Compare with updated tf-weights. Must all be the same.
            else:
                tf_weights = tf_updated_weights[update_iteration]
                in_ = tf_inputs[update_iteration]
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = algo.local_replay_buffer
                patch_buffer_with_fake_sampling_method(buf, in_)
                algo.train()
                # Compare updated model and target weights.
                for tf_key in tf_weights.keys():
                    tf_var = tf_weights[tf_key]
                    # Target model.
                    if re.search(
                        "actor_out_1|actor_hidden_0_1|sequential_[23]", tf_key
                    ):
                        torch_var = policy.target_model.state_dict()[map_[tf_key]]
                    # Model.
                    else:
                        torch_var = policy.model.state_dict()[map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(tf_var, np.transpose(torch_var.cpu()), atol=0.1)
                    else:
                        check(tf_var, torch_var, atol=0.1)

        algo.stop()
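# For reference: the Huber loss assumed by the expected critic-loss computation
# above (config.use_huber=True, config.huber_threshold=1.0). A minimal numpy
# sketch of the standard formulation, not necessarily the exact helper RLlib
# uses internally:
#   L(x) = 0.5 * x^2                            if |x| <= threshold
#   L(x) = threshold * (|x| - 0.5 * threshold)  otherwise
def _huber_loss_sketch(td_error, threshold=1.0):
    """Elementwise Huber loss on a TD-error array."""
    td_error = np.asarray(td_error)
    return np.where(
        np.abs(td_error) <= threshold,
        0.5 * np.square(td_error),
        threshold * (np.abs(td_error) - 0.5 * threshold),
    )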
def _import_ddpg():
    import ray.rllib.algorithms.ddpg as ddpg

    return ddpg.DDPG, ddpg.DDPGConfig().to_dict()
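if __name__ == "__main__":
    # Typical standalone entry point for RLlib test modules; assumes pytest is
    # installed. Added here only as a usage sketch for running this file.
    import sys

    import pytest

    sys.exit(pytest.main(["-v", __file__]))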