def make_random_test(
    n_random_test_questions: Optional[int],
    elicited_input_features: np.ndarray,
    elicited_preferences: np.ndarray,
    reward_iterations: int,
    query_type: str,
    equiv_size: float,
    sim,
    use_equiv: bool,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Build an alignment test from random questions labelled by the mean posterior reward.

    The mean reward is obtained from ``get_mean_reward`` using the elicited
    features/preferences, random questions are drawn with
    ``make_random_questions``, and each question is answered by the sign of
    its normal's dot product with that mean reward.

    Returns:
        A ``(normals, preferences, input_features)`` tuple, where ``normals``
        has been passed through ``orient_normals`` with the computed
        preferences.

    Raises:
        ValueError: If ``n_random_test_questions`` is None.
    """
    if n_random_test_questions is None:
        raise ValueError(
            "Must supply n_random_test_questions if use_random_test_questions is true."
        )

    posterior_mean = get_mean_reward(
        elicited_input_features,
        elicited_preferences,
        reward_iterations,
        query_type,
        equiv_size,
    )
    logging.info(f"Mean posterior reward for use in random test: {posterior_mean}")

    questions = make_random_questions(n_random_test_questions, sim)
    question_features, raw_normals = make_normals(questions, sim, use_equiv)

    # A question is answered "True" when the mean reward lies on the positive
    # side of its normal.
    answers = (raw_normals @ posterior_mean) > 0
    expected_shape = (raw_normals.shape[0],)
    assert answers.shape == expected_shape

    oriented = orient_normals(raw_normals, answers)
    return oriented, answers, question_features
def legacy_make_test_rewards(
    n_questions: int,
    n_rewards: int,
    true_reward: np.ndarray,
    epsilons: List[float],
    use_equiv: bool,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """Generates n_rewards reward vectors per epsilon and determines which are aligned.

    Random questions are generated once and oriented by the true reward. For
    each epsilon, only questions whose true value gap exceeds epsilon are
    kept, and Gaussian rewards centred on ``true_reward`` are resampled with
    an adjusted ``cov`` until the fraction of rewards agreeing with every
    kept question falls within [0.45, 0.55], or ``cov`` leaves (0, 100).

    Returns:
        A dict mapping each epsilon to ``(rewards, ground_truth_alignment)``,
        with shapes ``(n_rewards, n_reward_features)`` and ``(n_rewards,)``.
    """
    assert n_rewards > 0
    assert_reward(true_reward, use_equiv)

    trajs = make_random_questions(n_questions, Driver())
    _, normals = make_normals(trajs, Driver(), use_equiv)
    gt_pref = true_reward @ normals.T > 0
    normals = orient_normals(normals, gt_pref, use_equiv)
    assert_normals(normals, use_equiv)

    n_reward_features = normals.shape[1]

    # The true value gap of each question does not depend on epsilon, so
    # compute it once up front.
    true_value_gaps = true_reward @ normals.T

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = {}

    for epsilon in epsilons:
        assert epsilon >= 0.0

        cov = 1.0

        rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)

        # Filter into a per-epsilon local. Overwriting `normals` here would
        # make the filter cumulative, so an epsilon smaller than an earlier
        # one would silently be evaluated on the earlier, stricter subset.
        eps_normals = normals[true_value_gaps > epsilon]

        ground_truth_alignment = cast(
            np.ndarray, np.all(rewards @ eps_normals.T > 0, axis=1)
        )
        mean_agree = np.mean(ground_truth_alignment)

        while mean_agree > 0.55 or mean_agree < 0.45:
            # Too many aligned rewards -> widen the distribution; too few ->
            # narrow it.
            if mean_agree > 0.55:
                cov *= 1.1
            else:
                cov /= 1.1

            if not np.isfinite(cov) or cov <= 0.0 or cov >= 100.0:
                # TODO(joschnei): Break is a code smell
                logging.warning(f"cov={cov}, using last good batch of rewards.")
                break

            rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
            ground_truth_alignment = cast(
                np.ndarray, np.all(rewards @ eps_normals.T > 0, axis=1)
            )
            mean_agree = np.mean(ground_truth_alignment)

        assert ground_truth_alignment.shape == (n_rewards,)
        assert rewards.shape == (n_rewards, n_reward_features)

        test_rewards[epsilon] = (rewards, ground_truth_alignment)

    return test_rewards
def test_orient_normals(actions: np.ndarray, reward: np.ndarray):
    """Check that orienting normals by preference makes every value gap non-negative.

    After orientation, the reward's dot product with each normal must equal
    the absolute value of the dot product with the un-oriented normal.
    """
    unit_reward = safe_normalize(reward)
    _, raw_normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)

    signed_gaps = unit_reward @ raw_normals.T
    oriented = orient_normals(raw_normals, preferences=signed_gaps > 0)

    assert_normals(oriented)

    oriented_gaps = unit_reward @ oriented.T
    assert np.all(oriented_gaps == np.abs(signed_gaps))
def main(datadir: Path) -> None:
    """Run consistency checks over the elicitation artifacts saved in ``datadir``.

    Loads ``flags.pkl``, ``inputs.npy``, ``input_features.npy``,
    ``normals.npy`` and ``preferences.npy`` and asserts that their shapes and
    contents agree with each other. If ``true_reward.npy`` and/or
    ``mean_reward.npy`` exist they are checked too, and the accuracy of the
    mean reward on the oriented normals is logged.

    Args:
        datadir: Directory containing the saved files; coerced to ``Path``.
    """
    logging.basicConfig(level="INFO")

    datadir = Path(datadir)

    # The flags are not used below; the file is still unpickled so that a
    # missing or corrupt flags.pkl keeps failing this check. The context
    # manager guarantees the handle is closed.
    # NOTE(review): pickle executes arbitrary code on load -- only point this
    # at trusted data directories.
    with open(datadir / "flags.pkl", "rb") as flags_file:
        pickle.load(flags_file)

    use_equiv = False
    sim = Driver()
    n_reward_features = sim.num_of_features

    inputs = np.load(datadir / "inputs.npy")
    assert inputs.shape[1] == 2

    input_features = np.load(datadir / "input_features.npy")
    n_questions = input_features.shape[0]
    assert input_features.shape == (n_questions, 2, n_reward_features), input_features.shape

    assert_input_feature_consistency(inputs, input_features, sim)

    normals = np.load(datadir / "normals.npy")
    logging.info(f"There are {normals.shape[0]} questions")
    assert_normals(normals, use_equiv, n_reward_features)

    assert_normal_consistency(input_features, normals)

    preferences = np.load(datadir / "preferences.npy")
    assert preferences.shape == (n_questions,)
    # Only strict preferences (+1 / -1) are accepted here.
    assert np.all((preferences == 1) | (preferences == -1))

    oriented_normals = orient_normals(normals, preferences)

    if (datadir / "true_reward.npy").exists():
        true_reward = np.load(datadir / "true_reward.npy")
        assert_reward(true_reward, use_equiv, n_reward_features)
        logging.info(f"true_reward={true_reward}")
        assert_true_reward_consistency(oriented_normals, true_reward)

    if (datadir / "mean_reward.npy").exists():
        mean_reward = np.load(datadir / "mean_reward.npy")
        logging.info(f"mean_reward={mean_reward}")
        assert_reward(mean_reward, use_equiv, n_reward_features)

        # Fraction of oriented questions the mean reward answers correctly.
        mean_accuracy = np.mean(oriented_normals @ mean_reward > 0)
        logging.info(f"Accuracy of mean reward function is {mean_accuracy}")
def main(
    datadir=Path("volume-data/questions"),
    n_replications=10,
    reward_name=Path("true_reward.npy"),
    questions_name=Path("normals.npy"),
    preferences_name=Path("preferences.npy"),
):
    """Print summary statistics of the true-reward value gap across replications.

    For each replication ``i`` in ``1..n_replications``, loads the reward,
    question normals and preferences from ``datadir / str(i)``, orients the
    normals by the preferences, and collects ``normals @ reward``. Prints the
    mean, standard deviation, and 5th/95th percentiles of the pooled gaps.

    Args:
        datadir: Root directory holding one numbered subdirectory per
            replication; a string is accepted and coerced to ``Path``.
        n_replications: Number of replications to pool; must be at least 1.
        reward_name: File name of the reward array in each replication.
        questions_name: File name of the question normals in each replication.
        preferences_name: File name of the preferences in each replication.

    Raises:
        ValueError: If ``n_replications`` is less than 1.
    """
    if n_replications < 1:
        raise ValueError(
            f"n_replications must be at least 1, got n_replications={n_replications}"
        )

    datadir = Path(datadir)

    # Collect the per-replication gaps and concatenate once at the end rather
    # than re-copying the growing array on every iteration.
    per_replication_gaps = []
    for replication in range(1, 1 + n_replications):
        replication_dir = datadir / str(replication)
        reward = np.load(replication_dir / reward_name)
        questions = np.load(replication_dir / questions_name)
        preferences = np.load(replication_dir / preferences_name)

        normals = orient_normals(questions, preferences)
        per_replication_gaps.append(np.atleast_1d(normals @ reward))

    gaps = np.concatenate(per_replication_gaps)

    print(f"Mean reward gap: {np.mean(gaps)} ({np.std(gaps)})")
    # The label states the percentiles that are actually computed (5 and 95).
    print(
        f"5/95th percentiles: {np.percentile(gaps, 5)}, {np.percentile(gaps, 95)}"
    )
def make_gt_test_align(
    test_rewards: np.ndarray,
    n_questions: int,
    true_reward: np.ndarray,
    epsilon: float,
    use_equiv: bool = False,
) -> np.ndarray:
    """Label each test reward as aligned or not using random ground-truth questions.

    Random questions are generated with a fresh ``Driver``; only those whose
    absolute true value gap exceeds ``epsilon`` are kept and oriented by the
    true reward. A test reward is marked aligned when its dot product with
    every kept, oriented normal is strictly positive.

    Returns:
        A boolean array with one entry per row of ``test_rewards``.
    """
    driver = Driver()
    questions = make_random_questions(n_questions, driver)
    _, all_normals = make_normals(questions, driver, use_equiv)

    true_gaps = true_reward @ all_normals.T
    keep = np.abs(true_gaps) > epsilon

    kept_normals = all_normals[keep]
    kept_prefs = true_gaps[keep] > 0
    oriented = orient_normals(kept_normals, kept_prefs, use_equiv)

    agrees = test_rewards @ oriented.T > 0
    alignment = cast(np.ndarray, np.all(agrees, axis=1))

    expected_shape = (test_rewards.shape[0],)
    assert (
        alignment.shape == expected_shape
    ), f"alignment shape={alignment.shape} is not expected {test_rewards.shape[0]}"
    return alignment
def simulated(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    human_samples: List[int] = [1],
    n_reward_samples: int = 1000,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    traj_opt: bool = False,
    datadir: Path = Path(),
    outdir: Path = Path(),
    deltas: List[Optional[float]] = [None],
    use_mean_reward: bool = False,
    use_random_test_questions: bool = False,
    n_random_test_questions: Optional[int] = None,
    use_cheating_questions: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    use_true_epsilon: bool = False,
    legacy_test_rewards: bool = False,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite_test_rewards: bool = False,
    overwrite_results: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """Evaluates alignment tests generated by ground-truth rewards.

    When ``replications`` is given, the function calls itself once for every
    replication subdirectory of ``datadir`` that exists (forwarding all other
    arguments, with ``datadir``/``outdir`` pointed at the subdirectory) and
    then returns.

    Otherwise it loads the elicitation results and true reward from
    ``datadir``, builds the test questions (random ones answered by the mean
    posterior reward, or the first ``max(human_samples)`` elicited ones),
    builds the test rewards, runs ``run_gt_experiment`` for every experiment
    through joblib, and pickles the resulting confusion matrices and minimal
    tests under ``outdir``.

    NOTE(review): ``use_equiv``, ``flags_name``, ``normals_name``,
    ``preferences_name``, ``input_features_name`` and ``true_reward_name`` are
    not defined in this function; presumably they are module-level names --
    confirm.
    NOTE(review): the list defaults in the signature are kept for interface
    compatibility; they are not mutated in this function.
    """
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(f"Replication {replication} does not exist, skipping")
                continue

            logging.info(f"Starting replication {replication}")

            simulated(
                epsilons=epsilons,
                deltas=deltas,
                n_rewards=n_rewards,
                human_samples=human_samples,
                n_reward_samples=n_reward_samples,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                # traj_opt must be forwarded too, otherwise it silently
                # resets to False for every replication.
                traj_opt=traj_opt,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_mean_reward=use_mean_reward,
                use_random_test_questions=use_random_test_questions,
                use_cheating_questions=use_cheating_questions,
                n_random_test_questions=n_random_test_questions,
                skip_remove_duplicates=skip_remove_duplicates,
                skip_epsilon_filtering=skip_epsilon_filtering,
                skip_redundancy_filtering=skip_redundancy_filtering,
                use_true_epsilon=use_true_epsilon,
                legacy_test_rewards=legacy_test_rewards,
                n_cpus=n_cpus,
                overwrite_test_rewards=overwrite_test_rewards,
                overwrite_results=overwrite_results,
                verbosity=verbosity,
            )
        # Return instead of exit(): all replications are done, and a library
        # function should not terminate the interpreter.
        return

    logging.info(f"Using {n_cpus} cpus.")
    parallel = Parallel(n_jobs=n_cpus)

    outdir.mkdir(parents=True, exist_ok=True)

    if n_random_test_questions is not None:
        # Argh defaults to parsing something as a string if its optional
        n_random_test_questions = int(n_random_test_questions)

    # NOTE(review): pickle executes arbitrary code on load -- only use with
    # trusted data directories.
    with open(datadir / flags_name, "rb") as flags_file:
        flags = pkl.load(flags_file)
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    env = Driver()
    n_reward_features = env.num_of_features

    logging.info("Loading elicitation results")
    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, n_reward_features)

    if use_equiv:
        # The true reward gets one extra trailing component when equivalence
        # answers are modelled.
        true_reward = np.append(true_reward, [1])
    else:
        assert not np.any(elicited_preferences == 0)

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_reward_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
        use_true_epsilon=use_true_epsilon,
        true_reward=true_reward,
    )
    logging.info(
        f"""Filtering settings:
    # reward samples={n_reward_samples},
    use mean reward={use_mean_reward},
    skip duplicates={skip_remove_duplicates}
    skip noise={True}
    skip epsilon={skip_epsilon_filtering}
    skip redundancy={skip_redundancy_filtering}
    use true epsilon={use_true_epsilon}
    """
    )

    confusion_path, test_path = make_outnames(
        outdir,
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
    )
    confusions: Dict[Experiment, np.ndarray] = load(confusion_path, overwrite_results, default={})
    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path, overwrite_results, default={})

    experiments = make_experiments(
        epsilons, deltas, human_samples, overwrite_results, experiments=set(minimal_tests.keys())
    )

    if use_random_test_questions:
        logging.info("Making random test")
        logging.info(f"True reward: {true_reward}")
        normals, preferences, input_features = make_random_test(
            n_random_test_questions,
            elicited_input_features,
            elicited_preferences,
            reward_iterations=flags["reward_iterations"],
            query_type=query_type,
            equiv_size=equiv_probability,
            sim=env,
            use_equiv=use_equiv,
        )

        good_indices = (true_reward @ normals.T) > 0

        # ".2f" (two decimals); the previous "2f" spec only set a minimum
        # field width and printed six decimals.
        logging.info(
            f"{np.mean(good_indices)*100:.2f}% of new test questions agree with gt reward."
        )

        if use_cheating_questions:
            logging.info("Selecting only questions consistent with gt reward")
            normals = normals[good_indices]
            preferences = preferences[good_indices]
            input_features = input_features[good_indices]

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; this check is assumed to cover the whole random-test branch
        # -- confirm.
        assert_normals(normals, use_equiv)
    else:
        max_n = max(human_samples)
        preferences = elicited_preferences[:max_n]
        input_features = elicited_input_features[:max_n]
        logging.debug(f"elicited_normals={elicited_normals[:10]}")
        normals = orient_normals(
            elicited_normals[:max_n], preferences, use_equiv, n_reward_features
        )
        logging.debug(f"normals={normals[:10]}")

        # NOTE(review): assumed to apply to the elicited-question branch only,
        # since the random-test branch logs that only a fraction of its
        # questions agree with the true reward -- confirm.
        assert np.all(true_reward @ normals.T >= 0)

    if not legacy_test_rewards:
        test_rewards = make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            traj_opt=traj_opt,
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite_test_rewards,
        )
    else:
        test_rewards = legacy_make_test_rewards(1000, n_rewards, true_reward, epsilons, use_equiv)

    for indices, confusion, experiment in parallel(
        delayed(run_gt_experiment)(
            normals=normals,
            test_rewards=test_rewards[epsilon][0],
            test_reward_alignment=test_rewards[epsilon][1],
            epsilon=epsilon,
            delta=delta,
            use_equiv=use_equiv,
            n_human_samples=n,
            factory=factory,
            input_features=input_features,
            preferences=preferences,
            outdir=outdir,
            verbosity=verbosity,
        )
        for epsilon, delta, n in experiments
    ):
        minimal_tests[experiment] = indices
        confusions[experiment] = confusion

    # Context managers ensure the result files are flushed and closed.
    with open(confusion_path, "wb") as confusion_file:
        pkl.dump(confusions, confusion_file)
    with open(test_path, "wb") as test_file:
        pkl.dump(minimal_tests, test_file)