def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["model"]["vf_share_layers"] = True

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check no free log std var by default.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 0, matching

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw in ["tf2", "tfe"]:
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        elif fw == "torch":
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw != "torch" else \
            list(policy.model.parameters())
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(
            train_batch[SampleBatch.CUR_OBS],
            vars[0 if fw != "torch" else 2],
            vars[1 if fw != "torch" else 3],
            framework=fw)
        expected_logits = fc(
            expected_shared_out,
            vars[2 if fw != "torch" else 0],
            vars[3 if fw != "torch" else 1],
            framework=fw)
        expected_value_outs = fc(
            expected_shared_out, vars[4], vars[5], framework=fw)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess)
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy._mean_kl,
                    policy._mean_entropy,
                    policy._mean_policy_loss,
                    policy._mean_vf_loss,
                    policy._total_loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch, shuffle=False))
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        else:
            check(policy._mean_kl, kl)
            check(policy._mean_entropy, entropy)
            check(policy._mean_policy_loss, np.mean(-pg_loss))
            check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy._total_loss, overall_loss, decimals=4)
        trainer.stop()
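# Illustrative helper (not part of the test above; the name
# `discounted_returns` is an assumption for this sketch only): with
# use_gae=False and a terminal final step, the VALUE_TARGETS checked above
# are plain discounted returns R_t = r_t + gamma * R_{t+1}, which is what the
# hand-derived comment "A = [0.99^2 * 0.5 + ...]" computes.
import numpy as np


def discounted_returns(rewards, gamma):
    returns = np.zeros(len(rewards))
    running = 0.0
    # Walk the episode backwards, accumulating the discounted sum.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# discounted_returns(np.array([1.0, -1.0, 0.5]), 0.99)
# -> [0.50005, -0.505, 0.5], matching the check above.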
def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma,
                     sess):
    """Emulates SAC loss functions for tf and torch."""
    # ks:
    # 0=log_alpha
    # 1=target log-alpha (not used)
    # 2=action hidden bias
    # 3=action hidden kernel
    # 4=action out bias
    # 5=action out kernel
    # 6=Q hidden bias
    # 7=Q hidden kernel
    # 8=Q out bias
    # 9=Q out kernel
    # 14=target Q hidden bias
    # 15=target Q hidden kernel
    # 16=target Q out bias
    # 17=target Q out kernel
    alpha = np.exp(log_alpha)
    cls = TorchSquashedGaussian if fw == "torch" else SquashedGaussian
    model_out_t = train_batch[SampleBatch.CUR_OBS]
    model_out_tp1 = train_batch[SampleBatch.NEXT_OBS]
    target_model_out_tp1 = train_batch[SampleBatch.NEXT_OBS]

    # get_policy_output
    action_dist_t = cls(
        fc(
            relu(
                fc(model_out_t, weights[ks[3]], weights[ks[2]],
                   framework=fw)),
            weights[ks[5]],
            weights[ks[4]]), None)
    policy_t = action_dist_t.deterministic_sample()
    log_pis_t = action_dist_t.logp(policy_t)
    if sess:
        log_pis_t = sess.run(log_pis_t)
        policy_t = sess.run(policy_t)
    log_pis_t = np.expand_dims(log_pis_t, -1)

    # Get policy output for t+1.
    action_dist_tp1 = cls(
        fc(
            relu(
                fc(model_out_tp1, weights[ks[3]], weights[ks[2]],
                   framework=fw)),
            weights[ks[5]],
            weights[ks[4]]), None)
    policy_tp1 = action_dist_tp1.deterministic_sample()
    log_pis_tp1 = action_dist_tp1.logp(policy_tp1)
    if sess:
        log_pis_tp1 = sess.run(log_pis_tp1)
        policy_tp1 = sess.run(policy_tp1)
    log_pis_tp1 = np.expand_dims(log_pis_tp1, -1)

    # Q-values for the actually selected actions.
    # get_q_values
    q_t = fc(
        relu(
            fc(np.concatenate(
                [model_out_t, train_batch[SampleBatch.ACTIONS]], -1),
               weights[ks[7]],
               weights[ks[6]],
               framework=fw)),
        weights[ks[9]],
        weights[ks[8]],
        framework=fw)

    # Q-values for current policy in given current state.
    # get_q_values
    q_t_det_policy = fc(
        relu(
            fc(np.concatenate([model_out_t, policy_t], -1),
               weights[ks[7]],
               weights[ks[6]],
               framework=fw)),
        weights[ks[9]],
        weights[ks[8]],
        framework=fw)

    # Target q network evaluation.
    # target_model.get_q_values
    q_tp1 = fc(
        relu(
            fc(np.concatenate([target_model_out_tp1, policy_tp1], -1),
               weights[ks[15]],
               weights[ks[14]],
               framework=fw)),
        weights[ks[17]],
        weights[ks[16]],
        framework=fw)

    q_t_selected = np.squeeze(q_t, axis=-1)
    q_tp1 -= alpha * log_pis_tp1
    q_tp1_best = np.squeeze(q_tp1, axis=-1)

    dones = train_batch[SampleBatch.DONES]
    rewards = train_batch[SampleBatch.REWARDS]
    if fw == "torch":
        dones = dones.float().numpy()
        rewards = rewards.numpy()

    q_tp1_best_masked = (1.0 - dones) * q_tp1_best
    q_t_selected_target = rewards + gamma * q_tp1_best_masked
    base_td_error = np.abs(q_t_selected - q_t_selected_target)
    td_error = base_td_error
    critic_loss = [
        0.5 * np.mean(np.power(q_t_selected_target - q_t_selected, 2.0))
    ]
    target_entropy = -np.prod((1, ))
    alpha_loss = -np.mean(log_alpha * (log_pis_t + target_entropy))
    actor_loss = np.mean(alpha * log_pis_t - q_t_det_policy)

    return critic_loss, actor_loss, alpha_loss, td_error
def test_simple_q_loss_function(self):
    """Tests the Simple-Q loss function results on all frameworks."""
    config = dqn.simple_q.SimpleQConfig().rollouts(num_rollout_workers=0)
    # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
    config.training(model={
        "fcnet_hiddens": [10],
        "fcnet_activation": "linear",
    })

    for fw in framework_iterator(config):
        # Generate Trainer and get its default Policy object.
        trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Batch of size=2.
        input_ = SampleBatch({
            SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
            SampleBatch.ACTIONS: np.array([0, 1]),
            SampleBatch.REWARDS: np.array([0.4, -1.23]),
            SampleBatch.DONES: np.array([False, False]),
            SampleBatch.NEXT_OBS: np.random.random(size=(2, 4)),
            SampleBatch.EPS_ID: np.array([1234, 1234]),
            SampleBatch.AGENT_INDEX: np.array([0, 0]),
            SampleBatch.ACTION_LOGP: np.array([-0.1, -0.1]),
            SampleBatch.ACTION_DIST_INPUTS: np.array(
                [[0.1, 0.2], [-0.1, -0.2]]),
            SampleBatch.ACTION_PROB: np.array([0.1, 0.2]),
            "q_values": np.array([[0.1, 0.2], [0.2, 0.1]]),
        })
        # Get model vars for computing expected model outs (q-vals).
        # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
        vars = policy.get_weights()
        if isinstance(vars, dict):
            vars = list(vars.values())
        vars_t = policy.target_model.variables()
        if fw == "tf":
            vars_t = policy.get_session().run(vars_t)

        # Q(s,a) outputs.
        q_t = np.sum(
            one_hot(input_[SampleBatch.ACTIONS], 2) * fc(
                fc(
                    input_[SampleBatch.CUR_OBS],
                    vars[0 if fw != "torch" else 2],
                    vars[1 if fw != "torch" else 3],
                    framework=fw,
                ),
                vars[2 if fw != "torch" else 0],
                vars[3 if fw != "torch" else 1],
                framework=fw,
            ),
            1,
        )
        # max[a'](Qtarget(s',a')) outputs.
        q_target_tp1 = np.max(
            fc(
                fc(
                    input_[SampleBatch.NEXT_OBS],
                    vars_t[0 if fw != "torch" else 2],
                    vars_t[1 if fw != "torch" else 3],
                    framework=fw,
                ),
                vars_t[2 if fw != "torch" else 0],
                vars_t[3 if fw != "torch" else 1],
                framework=fw,
            ),
            1,
        )
        # TD-errors (Bellman equation).
        td_error = q_t - config.gamma * input_[
            SampleBatch.REWARDS] + q_target_tp1
        # Huber/Square loss on TD-error.
        expected_loss = huber_loss(td_error).mean()

        if fw == "torch":
            input_ = policy._lazy_tensor_dict(input_)
        # Get actual out and compare.
        if fw == "tf":
            out = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    input_, shuffle=False),
            )
        else:
            out = (loss_torch if fw == "torch" else loss_tf)(
                policy, policy.model, None, input_)
        check(out, expected_loss, decimals=1)
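# Standalone sketch (assumed name `huber`; not RLlib's huber_loss utility) of
# the Huber loss the Simple-Q test applies to its TD-errors: quadratic for
# |x| <= delta and linear beyond, which keeps a few large TD-errors from
# dominating the mean loss.
import numpy as np


def huber(x, delta=1.0):
    # Elementwise: 0.5 * x^2 inside the delta band, delta * (|x| - 0.5*delta)
    # outside of it.
    quadratic = 0.5 * np.square(x)
    linear = delta * (np.abs(x) - 0.5 * delta)
    return np.where(np.abs(x) <= delta, quadratic, linear)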
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["vf_share_layers"] = True

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        SampleBatch.ACTION_DIST_INPUTS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        SampleBatch.ACTION_LOGP: np.array(
            [-0.5, -0.1, -0.2], dtype=np.float32),
    }

    for fw in ["tf", "torch"]:
        print("framework={}".format(fw))
        config["use_pytorch"] = fw == "torch"
        config["eager"] = fw == "tf"

        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        if fw == "tf":
            train_batch = postprocess_ppo_gae_tf(policy, train_batch)
        else:
            train_batch = postprocess_ppo_gae_torch(policy, train_batch)
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in policy.loss_obj)
        # for tf.
        if fw == "tf":
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        else:
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw == "tf" else \
            list(policy.model.parameters())
        expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS], vars[0],
                                 vars[1])
        expected_logits = fc(expected_shared_out, vars[2], vars[3])
        expected_value_outs = fc(expected_shared_out, vars[4], vars[5])

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw == "tf" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs)
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)
def do_test_log_likelihood(run,
                           config,
                           prev_a=None,
                           continuous=False,
                           layer_key=("fc", (0, 4),
                                      ("_hidden_layers.0.", "_logits.")),
                           logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        if run in [sac.SACTrainer] and fw == "tfe":
            continue

        trainer = run(config=config, env=env)

        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 1000 if not continuous else 50
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(
                    obs_batch[0],
                    prev_action=prev_a,
                    prev_reward=prev_r,
                    explore=True))

        # Test all taken actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(num_actions):
                a = actions[idx]
                if fw != "torch":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(obs_batch,
                               vars["default_policy/{}_1/kernel".format(
                                   layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["{}_model.0.weight".format(layer_key[2][0])],
                           framework=fw),
                        vars["{}_model.0.weight".format(layer_key[2][1])],
                        framework=fw)
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_prob = count / num_actions
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(np.exp(logp), expected_prob, atol=0.2)
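# Illustrative helper (assumed name `diag_gaussian_logp`; not part of the test
# above) showing the diagonal-Gaussian log-likelihood that the continuous
# branch approximates via scipy's norm.pdf: the sum of per-dimension Normal
# log-densities for action x under mean and log_std.
import numpy as np


def diag_gaussian_logp(x, mean, log_std):
    var = np.exp(2.0 * log_std)
    return np.sum(
        -0.5 * np.log(2.0 * np.pi * var) - 0.5 * np.square(x - mean) / var,
        axis=-1)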
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = SampleBatch({
        SampleBatch.OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.EPS_ID: np.array([1234, 1234, 1234]),
        SampleBatch.AGENT_INDEX: np.array([0, 0, 0]),
    })

    for fw, sess in framework_iterator(config, session=True):
        dist_cls = (Categorical if fw != "torch" else TorchCategorical)
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()
        if sess:
            vars = policy.get_session().run(vars)

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch_ = pg.post_process_advantages(policy, train_batch.copy())
        if fw == "torch":
            train_batch_ = policy._lazy_tensor_dict(train_batch_)

        # Check Advantage values.
        check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        if sess:
            results = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch_, shuffle=False))
        else:
            results = (pg.pg_tf_loss
                       if fw in ["tf2", "tfe"] else pg.pg_torch_loss)(
                           policy,
                           policy.model,
                           dist_class=dist_cls,
                           train_batch=train_batch_)

        # Calculate expected results.
        if fw != "torch":
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[0],
                   vars[1],
                   framework=fw),
                vars[2],
                vars[3],
                framework=fw)
        else:
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[2],
                   vars[3],
                   framework=fw),
                vars[0],
                vars[1],
                framework=fw)
        expected_logp = dist_cls(expected_logits, policy.model).logp(
            train_batch_[SampleBatch.ACTIONS])
        adv = train_batch_[Postprocessing.ADVANTAGES]
        if sess:
            expected_logp = sess.run(expected_logp)
        elif fw == "torch":
            expected_logp = expected_logp.detach().cpu().numpy()
            adv = adv.detach().cpu().numpy()
        else:
            expected_logp = expected_logp.numpy()
        expected_loss = -np.mean(expected_logp * adv)
        check(results, expected_loss, decimals=4)
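# Minimal NumPy sketch (assumed name `expected_pg_loss`; not RLlib's
# pg_tf_loss/pg_torch_loss) of the vanilla policy-gradient objective the test
# reproduces by hand: L = -mean(log pi(a_t|s_t) * A_t), with logits turned
# into log-probabilities via a numerically stable log-softmax.
import numpy as np


def expected_pg_loss(logits, actions, advantages):
    # Log-softmax over the action dimension.
    z = logits - logits.max(axis=-1, keepdims=True)
    logp_all = z - np.log(np.exp(z).sum(axis=-1, keepdims=True))
    # Pick the log-prob of each taken action and weight by its advantage.
    logp = logp_all[np.arange(len(actions)), actions]
    return -np.mean(logp * advantages)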
def _ddpg_loss_helper(self, train_batch, weights, ks, fw, gamma,
                      huber_threshold, l2_reg, sess):
    """Emulates DDPG loss functions for tf and torch."""
    model_out_t = train_batch[SampleBatch.CUR_OBS]
    target_model_out_tp1 = train_batch[SampleBatch.NEXT_OBS]
    # get_policy_output
    policy_t = sigmoid(2.0 * fc(
        relu(fc(model_out_t, weights[ks[1]], weights[ks[0]], framework=fw)),
        weights[ks[5]], weights[ks[4]]))
    # Get policy output for t+1 (target model).
    policy_tp1 = sigmoid(2.0 * fc(
        relu(
            fc(target_model_out_tp1,
               weights[ks[3]],
               weights[ks[2]],
               framework=fw)), weights[ks[7]], weights[ks[6]]))
    # Assume no smooth target policy.
    policy_tp1_smoothed = policy_tp1

    # Q-values for the actually selected actions.
    # get_q_values
    q_t = fc(
        relu(
            fc(np.concatenate(
                [model_out_t, train_batch[SampleBatch.ACTIONS]], -1),
               weights[ks[9]],
               weights[ks[8]],
               framework=fw)),
        weights[ks[11]],
        weights[ks[10]],
        framework=fw)
    twin_q_t = fc(
        relu(
            fc(np.concatenate(
                [model_out_t, train_batch[SampleBatch.ACTIONS]], -1),
               weights[ks[13]],
               weights[ks[12]],
               framework=fw)),
        weights[ks[15]],
        weights[ks[14]],
        framework=fw)

    # Q-values for current policy in given current state.
    # get_q_values
    q_t_det_policy = fc(
        relu(
            fc(np.concatenate([model_out_t, policy_t], -1),
               weights[ks[9]],
               weights[ks[8]],
               framework=fw)),
        weights[ks[11]],
        weights[ks[10]],
        framework=fw)

    # Target q network evaluation.
    # target_model.get_q_values
    q_tp1 = fc(
        relu(
            fc(np.concatenate(
                [target_model_out_tp1, policy_tp1_smoothed], -1),
               weights[ks[17]],
               weights[ks[16]],
               framework=fw)),
        weights[ks[19]],
        weights[ks[18]],
        framework=fw)
    twin_q_tp1 = fc(
        relu(
            fc(np.concatenate(
                [target_model_out_tp1, policy_tp1_smoothed], -1),
               weights[ks[21]],
               weights[ks[20]],
               framework=fw)),
        weights[ks[23]],
        weights[ks[22]],
        framework=fw)

    q_t_selected = np.squeeze(q_t, axis=-1)
    twin_q_t_selected = np.squeeze(twin_q_t, axis=-1)
    q_tp1 = np.minimum(q_tp1, twin_q_tp1)
    q_tp1_best = np.squeeze(q_tp1, axis=-1)

    dones = train_batch[SampleBatch.DONES]
    rewards = train_batch[SampleBatch.REWARDS]
    if fw == "torch":
        dones = dones.float().numpy()
        rewards = rewards.numpy()

    q_tp1_best_masked = (1.0 - dones) * q_tp1_best
    q_t_selected_target = rewards + gamma * q_tp1_best_masked

    td_error = q_t_selected - q_t_selected_target
    twin_td_error = twin_q_t_selected - q_t_selected_target
    td_error = td_error + twin_td_error
    errors = huber_loss(td_error, huber_threshold) + \
        huber_loss(twin_td_error, huber_threshold)

    critic_loss = np.mean(errors)
    actor_loss = -np.mean(q_t_det_policy)
    # Add l2-regularization if required.
    for name, var in weights.items():
        if re.match("default_policy/actor_(hidden_0|out)/kernel", name):
            actor_loss += (l2_reg * l2_loss(var))
        elif re.match("default_policy/sequential(_1)?/\\w+/kernel", name):
            critic_loss += (l2_reg * l2_loss(var))

    return critic_loss, actor_loss, td_error
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["eager"] = True
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        BEHAVIOUR_LOGITS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32),
    }

    # tf.
    trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()

    # Post-process (calculate simple (non-GAE) advantages) and attach to
    # train_batch dict.
    # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
    # [0.50005, -0.505, 0.5]
    train_batch = postprocess_ppo_gae_tf(policy, train_batch)
    # Check Advantage values.
    check(train_batch[Postprocessing.VALUE_TARGETS],
          [0.50005, -0.505, 0.5])

    # Calculate actual PPO loss (results are stored in policy.loss_obj)
    # for tf.
    ppo_surrogate_loss_tf(policy, policy.model, Categorical, train_batch)

    vars = policy.model.trainable_variables()
    expected_logits = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
           vars[1].numpy()), vars[4].numpy(), vars[5].numpy())
    expected_value_outs = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[2].numpy(),
           vars[3].numpy()), vars[6].numpy(), vars[7].numpy())

    kl, entropy, pg_loss, vf_loss, overall_loss = \
        self._ppo_loss_helper(
            policy, policy.model, Categorical, train_batch,
            expected_logits, expected_value_outs)
    check(policy.loss_obj.mean_kl, kl)
    check(policy.loss_obj.mean_entropy, entropy)
    check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
    check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
    check(policy.loss_obj.loss, overall_loss, decimals=4)

    # Torch.
    config["use_pytorch"] = True
    trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    train_batch = postprocess_ppo_gae_torch(policy, train_batch)
    train_batch = policy._lazy_tensor_dict(train_batch)

    # Check Advantage values.
    check(train_batch[Postprocessing.VALUE_TARGETS],
          [0.50005, -0.505, 0.5])

    # Calculate actual PPO loss (results are stored in policy.loss_obj)
    # for torch.
    ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                             train_batch)

    kl, entropy, pg_loss, vf_loss, overall_loss = \
        self._ppo_loss_helper(
            policy, policy.model, TorchCategorical, train_batch,
            policy.model.last_output(),
            policy.model.value_function().detach().numpy())
    check(policy.loss_obj.mean_kl, kl)
    check(policy.loss_obj.mean_entropy, entropy)
    check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
    check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
    check(policy.loss_obj.loss, overall_loss, decimals=4)
def test_simple_q_loss_function(self):
    """Tests the Simple-Q loss function results on all frameworks."""
    config = dqn.SIMPLE_Q_DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    for fw in framework_iterator(config):
        # Generate Trainer and get its default Policy object.
        trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Batch of size=2.
        input_ = {
            SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
            SampleBatch.ACTIONS: np.array([0, 1]),
            SampleBatch.REWARDS: np.array([0.4, -1.23]),
            SampleBatch.DONES: np.array([False, False]),
            SampleBatch.NEXT_OBS: np.random.random(size=(2, 4)),
        }
        # Get model vars for computing expected model outs (q-vals).
        # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
        vars = policy.get_weights()
        if isinstance(vars, dict):
            vars = list(vars.values())
        vars_t = policy.target_q_func_vars
        if fw == "tf":
            vars_t = policy.get_session().run(vars_t)

        # Q(s,a) outputs.
        q_t = np.sum(
            one_hot(input_[SampleBatch.ACTIONS], 2) * fc(
                fc(input_[SampleBatch.CUR_OBS],
                   vars[0 if fw != "torch" else 2],
                   vars[1 if fw != "torch" else 3],
                   framework=fw),
                vars[2 if fw != "torch" else 0],
                vars[3 if fw != "torch" else 1],
                framework=fw), 1)
        # max[a'](Qtarget(s',a')) outputs.
        q_target_tp1 = np.max(
            fc(
                fc(input_[SampleBatch.NEXT_OBS],
                   vars_t[0 if fw != "torch" else 2],
                   vars_t[1 if fw != "torch" else 3],
                   framework=fw),
                vars_t[2 if fw != "torch" else 0],
                vars_t[3 if fw != "torch" else 1],
                framework=fw), 1)
        # TD-errors (Bellman equation).
        td_error = q_t - config["gamma"] * input_[SampleBatch.REWARDS] + \
            q_target_tp1
        # Huber/Square loss on TD-error.
        expected_loss = huber_loss(td_error).mean()

        if fw == "torch":
            input_ = policy._lazy_tensor_dict(input_)
        # Get actual out and compare.
        if fw == "tf":
            out = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    input_, shuffle=False))
        else:
            out = (loss_torch if fw == "torch" else loss_tf)(
                policy, policy.model, None, input_)
        check(out, expected_loss, decimals=1)
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["vf_share_layers"] = True

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.PREV_REWARDS: np.array(
            [1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        SampleBatch.ACTION_DIST_INPUTS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        SampleBatch.ACTION_LOGP: np.array(
            [-0.5, -0.1, -0.2], dtype=np.float32),
    }

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        if fw == "tf" or fw == "eager":
            train_batch = postprocess_ppo_gae_tf(policy, train_batch)
        else:
            train_batch = postprocess_ppo_gae_torch(policy, train_batch)
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw == "eager":
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        elif fw == "torch":
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw != "torch" else \
            list(policy.model.parameters())
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(
            train_batch[SampleBatch.CUR_OBS],
            vars[0 if fw != "torch" else 2],
            vars[1 if fw != "torch" else 3],
            framework=fw)
        expected_logits = fc(
            expected_shared_out,
            vars[2 if fw != "torch" else 0],
            vars[3 if fw != "torch" else 1],
            framework=fw)
        expected_value_outs = fc(
            expected_shared_out, vars[4], vars[5], framework=fw)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess)
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy.loss_obj.mean_kl,
                    policy.loss_obj.mean_entropy,
                    policy.loss_obj.mean_policy_loss,
                    policy.loss_obj.mean_vf_loss,
                    policy.loss_obj.loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch, shuffle=False))
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        else:
            check(policy.loss_obj.mean_kl, kl)
            check(policy.loss_obj.mean_entropy, entropy)
            check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
            check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy.loss_obj.loss, overall_loss, decimals=4)
def test_log_likelihood(run,
                        config,
                        prev_a=None,
                        continuous=False,
                        layer_key=("fc", (0, 4)),
                        logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    # Use Soft-Q for DQNs.
    if run is dqn.DQNTrainer:
        config["exploration_config"] = {"type": "SoftQ", "temperature": 0.5}

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if run in [dqn.DQNTrainer, sac.SACTrainer] and fw == "torch":
            continue
        print("Testing {} with framework={}".format(run, fw))
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        trainer = run(config=config, env=env)

        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 500
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(
                    obs_batch[0],
                    prev_action=prev_a,
                    prev_reward=prev_r,
                    explore=True))

        # Test 50 actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(50):
                a = actions[idx]
                if fw == "tf" or fw == "eager":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(obs_batch,
                               vars["default_policy/{}_1/kernel".format(
                                   layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["_hidden_layers.0._model.0.weight"]),
                        vars["_logits._model.0.weight"])
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_logp = np.log(count / num_actions)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp, rtol=0.3)