def test_success(self):
    """build_actor_loss should return the mean of the elementwise min of the twin Q-values."""
    q1_np = np.random.random((4, 1))
    q2_np = np.random.random((4, 1))
    q1_t = to_tf(q1_np)
    q2_t = to_tf(q2_np)
    loss = build_actor_loss(q1_t, q2_t)
    with self.test_session() as sess:
        # reference value computed purely in numpy
        expected = np.mean(np.min(np.hstack([q1_np, q2_np]), axis=1))
        assert np.allclose(sess.run(loss), expected)
def test_success(self):
    """build_pi_loss should equal mean(log_prob - min(q1, q2))."""
    q1_np = np.random.random((4, 1))
    q2_np = np.random.random((4, 1))
    log_prob_np = np.random.random((4, 1))
    # convert in the same order as the original arrays were created
    q1_t = to_tf(q1_np)
    q2_t = to_tf(q2_np)
    log_prob_t = to_tf(log_prob_np)
    loss = build_pi_loss(log_prob_t, q1_t, q2_t)
    with self.test_session() as sess:
        min_q = np.minimum(q1_np, q2_np)
        expected = np.mean(log_prob_np - min_q)
        assert np.allclose(sess.run(loss), expected)
def test_success_with_negative_clipped(self):
    """When values drop below old_values - epsilon, the loss must use the clipped value."""
    epsilon = np.random.random()
    old_values_np = np.random.random((4, 1))
    # push values 10% past the lower clip boundary so clipping is guaranteed
    values_np = old_values_np - epsilon * 1.1
    returns_np = -np.random.random((4, 1))
    values = to_tf(values_np)
    returns = to_tf(returns_np)
    old_values = to_tf(old_values_np)
    loss = build_value_loss(values, returns, old_values, epsilon, 0.5)
    with self.test_session() as sess:
        clipped = old_values_np - epsilon
        expected = 0.5 * np.mean((returns_np - clipped) ** 2)
        assert np.allclose(sess.run(loss), expected)
def test_success(self):
    """build_critic_loss should sum the MSE of each Q head against the target."""
    q1_np = np.random.random((4, 1))
    q2_np = np.random.random((4, 1))
    target_np = np.random.random((4, 1))
    q1_t = to_tf(q1_np)
    q2_t = to_tf(q2_np)
    target = to_tf(target_np)
    loss = build_critic_loss(q1_t, q2_t, target)
    with self.test_session() as sess:
        mse_q1 = np.mean((target_np - q1_np) ** 2)
        mse_q2 = np.mean((target_np - q2_np) ** 2)
        assert np.allclose(sess.run(loss), mse_q1 + mse_q2)
def test_success_with_unclipped(self):
    """Values within epsilon of old_values should be used as-is (no clipping)."""
    epsilon = np.random.random()
    returns_np = np.random.random((4, 1))
    old_values_np = np.random.random((4, 1))
    # stay inside the clip range: |values - old_values| = 0.9 * epsilon < epsilon
    values_np = old_values_np + epsilon * 0.9
    values = to_tf(values_np)
    returns = to_tf(returns_np)
    old_values = to_tf(old_values_np)
    loss = build_value_loss(values, returns, old_values, epsilon, 0.5)
    # the loss must be a scalar tensor
    assert len(loss.shape) == 0
    with self.test_session() as sess:
        expected = 0.5 * np.mean((returns_np - values_np) ** 2)
        assert np.allclose(sess.run(loss), expected)
def test_success(self):
    """build_v_loss should be 0.5 * MSE of v against the soft target min(q1, q2) - log_prob."""
    v_np = np.random.random((4, 1))
    q1_np = np.random.random((4, 1))
    q2_np = np.random.random((4, 1))
    log_prob_np = np.random.random((4, 1))
    v_t = to_tf(v_np)
    q1_t = to_tf(q1_np)
    q2_t = to_tf(q2_np)
    log_prob_t = to_tf(log_prob_np)
    loss = build_v_loss(v_t, q1_t, q2_t, log_prob_t)
    with self.test_session() as sess:
        soft_target = np.minimum(q1_np, q2_np) - log_prob_np
        expected = 0.5 * np.mean((v_np - soft_target) ** 2)
        assert np.allclose(sess.run(loss), expected)
def test_success(self):
    """build_critic_loss should be the MSE against the one-step TD target."""
    q_np = np.random.random((4, 1))
    rewards_np = np.random.random((4, 1))
    q_next_np = np.random.random((4, 1))
    # binary done flags mask out the bootstrap term
    dones_np = np.random.randint(2, size=(4, 1))
    q_t = to_tf(q_np)
    rewards_tp1 = to_tf(rewards_np)
    q_tp1 = to_tf(q_next_np)
    dones_tp1 = to_tf(dones_np)
    gamma = np.random.random()
    loss = build_critic_loss(q_t, rewards_tp1, q_tp1, dones_tp1, gamma)
    with self.test_session() as sess:
        td_target = rewards_np + gamma * q_next_np * (1.0 - dones_np)
        expected = np.mean((td_target - q_np) ** 2)
        assert np.allclose(sess.run(loss), expected)
def test_success_with_positive_not_clipped(self):
    """With positive advantages and ratio new/old < 0.5, the unclipped surrogate is the minimum."""
    # construct log-probs so that exp(new - old) is well below the clip range
    log_probs_np = np.log(np.random.random((4, 1)) * 0.2)
    old_log_probs_np = np.log(np.random.random((4, 1)) * 0.5 + 0.5)
    advantages_np = np.random.random((4, 1))
    log_probs = to_tf(log_probs_np)
    old_log_probs = to_tf(old_log_probs_np)
    advantages = to_tf(advantages_np)
    loss = build_policy_loss(log_probs, old_log_probs, advantages, 0.2)
    prob_ratio = np.exp(log_probs_np - old_log_probs_np)
    expected = -np.mean(prob_ratio * advantages_np)
    with self.test_session() as sess:
        assert np.allclose(sess.run(loss), expected)
def test_success(self):
    """build_target should bootstrap from the elementwise min of the twin next-state Q-values."""
    rewards_np = np.random.random((4, 1))
    q1_next_np = np.random.random((4, 1))
    q2_next_np = np.random.random((4, 1))
    dones_np = np.random.randint(2, size=(4, 1))
    gamma = np.random.random()
    rewards_tp1 = to_tf(rewards_np)
    q1_tp1 = to_tf(q1_next_np)
    q2_tp1 = to_tf(q2_next_np)
    dones_tp1 = to_tf(dones_np)
    target = build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1, gamma)
    with self.test_session() as sess:
        # min over the two heads, reshaped back to a column vector
        min_q_next = np.reshape(
            np.min(np.hstack([q1_next_np, q2_next_np]), axis=1), (-1, 1))
        expected = rewards_np + gamma * min_q_next * (1.0 - dones_np)
        assert np.allclose(sess.run(target), expected)