def legacy_make_test_rewards(
    n_questions: int,
    n_rewards: int,
    true_reward: np.ndarray,
    epsilons: List[float],
    use_equiv: bool,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """Generates n_rewards reward vectors and determines which are aligned."""
    assert n_rewards > 0
    assert_reward(true_reward, use_equiv)

    trajs = make_random_questions(n_questions, Driver())
    _, normals = make_normals(trajs, Driver(), use_equiv)
    gt_pref = true_reward @ normals.T > 0
    normals = orient_normals(normals, gt_pref, use_equiv)
    assert_normals(normals, use_equiv)
    n_reward_features = normals.shape[1]

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = {}
    for epsilon in epsilons:
        assert epsilon >= 0.0

        # Sample test rewards around the true reward and adapt the covariance
        # until roughly half of them are aligned with the epsilon-filtered
        # test questions.
        cov = 1.0

        rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
        normals = normals[true_reward @ normals.T > epsilon]
        ground_truth_alignment = cast(np.ndarray, np.all(rewards @ normals.T > 0, axis=1))
        mean_agree = np.mean(ground_truth_alignment)

        while mean_agree > 0.55 or mean_agree < 0.45:
            # Too many aligned rewards: spread the samples out. Too few: tighten them.
            if mean_agree > 0.55:
                cov *= 1.1
            else:
                cov /= 1.1
            if not np.isfinite(cov) or cov <= 0.0 or cov >= 100.0:
                # TODO(joschnei): Break is a code smell
                logging.warning(f"cov={cov}, using last good batch of rewards.")
                break
            rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
            normals = normals[true_reward @ normals.T > epsilon]
            ground_truth_alignment = cast(np.ndarray, np.all(rewards @ normals.T > 0, axis=1))
            mean_agree = np.mean(ground_truth_alignment)

        assert ground_truth_alignment.shape == (n_rewards,)
        assert rewards.shape == (n_rewards, n_reward_features)

        test_rewards[epsilon] = (rewards, ground_truth_alignment)
    return test_rewards
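
# Usage sketch (illustrative only, not part of the original module): builds a
# small test-reward set for a couple of epsilon thresholds. The 4-dimensional
# reward and its concrete values are assumptions for the example; np, logging,
# and safe_normalize are reused from this module's existing imports.
def _example_legacy_make_test_rewards() -> None:
    example_true_reward = safe_normalize(np.array([1.0, -1.0, 0.5, -0.5]))
    test_rewards = legacy_make_test_rewards(
        n_questions=100,
        n_rewards=50,
        true_reward=example_true_reward,
        epsilons=[0.0, 0.1],
        use_equiv=False,
    )
    for epsilon, (rewards, aligned) in test_rewards.items():
        logging.info(f"epsilon={epsilon}: {np.mean(aligned):.2f} of test rewards aligned")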
def make_random_test(
    n_random_test_questions: Optional[int],
    elicited_input_features: np.ndarray,
    elicited_preferences: np.ndarray,
    reward_iterations: int,
    query_type: str,
    equiv_size: float,
    sim,
    use_equiv: bool,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generates an alignment test of randomly generated questions answered according to the
    mean posterior reward."""
    if n_random_test_questions is None:
        raise ValueError(
            "Must supply n_random_test_questions if use_random_test_questions is true."
        )

    mean_reward = get_mean_reward(
        elicited_input_features,
        elicited_preferences,
        reward_iterations,
        query_type,
        equiv_size,
    )
    logging.info(f"Mean posterior reward for use in random test: {mean_reward}")

    inputs = make_random_questions(n_random_test_questions, sim)
    input_features, normals = make_normals(inputs, sim, use_equiv)

    preferences = normals @ mean_reward > 0
    assert preferences.shape == (normals.shape[0],)

    normals = orient_normals(normals, preferences)

    return normals, preferences, input_features
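
# Usage sketch (illustrative only): build the random alignment test from
# previously elicited data and check a candidate reward against it. The
# elicited arrays, the query_type string, and the hyperparameter values are
# placeholder assumptions standing in for whatever the elicitation run produced.
def _example_make_random_test(
    elicited_input_features: np.ndarray,
    elicited_preferences: np.ndarray,
    candidate_reward: np.ndarray,
) -> bool:
    normals, preferences, _ = make_random_test(
        n_random_test_questions=100,
        elicited_input_features=elicited_input_features,
        elicited_preferences=elicited_preferences,
        reward_iterations=100,
        query_type="strict",
        equiv_size=1.0,
        sim=Driver(),
        use_equiv=False,
    )
    # A candidate reward passes the test iff it agrees with the mean posterior
    # reward on every question, i.e. it lies on the positive side of every
    # oriented normal.
    return bool(np.all(candidate_reward @ normals.T > 0))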
def test_orient_normals(actions: np.ndarray, reward: np.ndarray):
    reward = safe_normalize(reward)

    _, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    value_diffs = reward @ normals.T
    prefs = value_diffs > 0

    oriented_normals = orient_normals(normals, preferences=prefs)
    assert_normals(oriented_normals)

    # Orienting only flips signs, so the value differences under the oriented
    # normals are exactly the absolute value differences under the originals.
    assert np.all(reward @ oriented_normals.T == np.abs(value_diffs))
def make_gt_test_align(
    test_rewards: np.ndarray,
    n_questions: int,
    true_reward: np.ndarray,
    epsilon: float,
    use_equiv: bool = False,
) -> np.ndarray:
    env = Driver()
    trajs = make_random_questions(n_questions, env)
    _, normals = make_normals(trajs, env, use_equiv)

    value_diff = true_reward @ normals.T
    eps_questions = np.abs(value_diff) > epsilon
    normals = normals[eps_questions]

    gt_pref = value_diff[eps_questions] > 0
    normals = orient_normals(normals, gt_pref, use_equiv)

    alignment = cast(np.ndarray, np.all(test_rewards @ normals.T > 0, axis=1))
    assert alignment.shape == (
        test_rewards.shape[0],
    ), f"alignment shape={alignment.shape} is not expected {test_rewards.shape[0]}"
    return alignment
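
# Usage sketch (illustrative only): score Gaussian-perturbed rewards against a
# fresh ground-truth test. The true reward and the sampling parameters are
# placeholder assumptions; make_gaussian_rewards and safe_normalize come from
# this module's existing helpers.
def _example_make_gt_test_align() -> None:
    true_reward = safe_normalize(np.array([1.0, -1.0, 0.5, -0.5]))
    test_rewards = make_gaussian_rewards(100, False, mean=true_reward, cov=1.0)
    alignment = make_gt_test_align(
        test_rewards=test_rewards,
        n_questions=200,
        true_reward=true_reward,
        epsilon=0.1,
    )
    logging.info(f"{np.mean(alignment):.2f} of sampled rewards pass the ground-truth test")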
def test_make_normals(actions: np.ndarray):
    features, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    assert np.all((features[0][0] - features[0][1]) == normals)
    assert_normals(normals)
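
# Manual-run sketch for the two tests above (illustrative only). In the original
# suite these presumably receive generated inputs from the test runner; here a
# single random question supplies the actions (so the first-pair check in
# test_make_normals applies) and the reward is an assumed 4-dimensional placeholder.
def _example_run_normal_tests() -> None:
    actions = make_random_questions(1, Driver())
    reward = safe_normalize(np.array([1.0, -1.0, 0.5, -0.5]))
    test_make_normals(actions)
    test_orient_normals(actions, reward)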