def legacy_make_test_rewards(
    n_questions: int,
    n_rewards: int,
    true_reward: np.ndarray,
    epsilons: List[float],
    use_equiv: bool,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """Generates n_rewards reward vectors and determines which are aligned."""
    assert n_rewards > 0
    assert_reward(true_reward, use_equiv)

    trajs = make_random_questions(n_questions, Driver())
    _, normals = make_normals(trajs, Driver(), use_equiv)
    gt_pref = true_reward @ normals.T > 0
    normals = orient_normals(normals, gt_pref, use_equiv)
    assert_normals(normals, use_equiv)
    n_reward_features = normals.shape[1]

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = {}
    for epsilon in epsilons:
        assert epsilon >= 0.0

        cov = 1.0
        rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
        # Keep only the normals that the true reward satisfies by a margin of at
        # least epsilon. Use a local name so each epsilon filters the full set
        # instead of an already filtered one.
        epsilon_normals = normals[true_reward @ normals.T > epsilon]
        ground_truth_alignment = cast(np.ndarray, np.all(rewards @ epsilon_normals.T > 0, axis=1))

        # Tune the sampling covariance until roughly half the sampled rewards
        # pass the test, so aligned and misaligned rewards are balanced.
        mean_agree = np.mean(ground_truth_alignment)
        while mean_agree > 0.55 or mean_agree < 0.45:
            if mean_agree > 0.55:
                cov *= 1.1
            else:
                cov /= 1.1
            if not np.isfinite(cov) or cov <= 0.0 or cov >= 100.0:
                # TODO(joschnei): Break is a code smell
                logging.warning(f"cov={cov}, using last good batch of rewards.")
                break
            rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
            ground_truth_alignment = cast(np.ndarray, np.all(rewards @ epsilon_normals.T > 0, axis=1))
            mean_agree = np.mean(ground_truth_alignment)

        assert ground_truth_alignment.shape == (n_rewards,)
        assert rewards.shape == (n_rewards, n_reward_features)

        test_rewards[epsilon] = (rewards, ground_truth_alignment)

    return test_rewards

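# Illustrative usage sketch for legacy_make_test_rewards. The 4-feature reward
# and the question/reward counts are assumptions for demonstration only, and
# safe_normalize is assumed to be in scope (it is used by the tests below).
def _example_legacy_test_rewards() -> None:
    true_reward = safe_normalize(np.array([1.0, -1.0, 1.0, -1.0]))
    tests = legacy_make_test_rewards(
        n_questions=100,
        n_rewards=10,
        true_reward=true_reward,
        epsilons=[0.0, 0.1],
        use_equiv=False,
    )
    rewards, aligned = tests[0.1]
    # Roughly half of the sampled rewards should pass the epsilon=0.1 test.
    logging.info(f"rewards shape={rewards.shape}, aligned fraction={aligned.mean()}")
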
def load_elicitation(
    datadir: Path,
    normals_name: Union[str, Path],
    preferences_name: Union[str, Path],
    input_features_name: Union[str, Path],
    n_reward_features: int,
    use_equiv: bool,
    query_type: Optional[str] = None,
    equiv_probability: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Loads and postprocesses elicitation.py output."""
    normals = np.load(datadir / normals_name)
    preferences = np.load(datadir / preferences_name)
    input_features = np.load(datadir / input_features_name)

    if use_equiv:
        assert equiv_probability is not None
        normals = add_equiv_constraints(preferences, normals, equiv_prob=equiv_probability)
    elif query_type == "weak":
        preferences, normals, input_features = remove_equiv(
            preferences,
            normals,
            input_features,
        )

    assert_normals(normals, False, n_reward_features)
    assert_nonempty(normals, preferences, input_features)

    return normals, preferences, input_features

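# Usage sketch, assuming the standard elicitation output layout (normals.npy,
# preferences.npy, input_features.npy under datadir). The path is a placeholder;
# in simulated() below the file names come from module-level *_name constants.
def _example_load_elicitation() -> None:
    normals, preferences, input_features = load_elicitation(
        datadir=Path("data/run0"),  # hypothetical path
        normals_name="normals.npy",
        preferences_name="preferences.npy",
        input_features_name="input_features.npy",
        n_reward_features=Driver().num_of_features,
        use_equiv=False,
    )
    logging.info(f"Loaded {len(preferences)} preferences")
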
def run_test(normals: np.ndarray, test_rewards: np.ndarray, use_equiv: bool) -> np.ndarray:
    """Returns the predicted alignment of the fake rewards by the normals."""
    assert_normals(normals, use_equiv)
    results = cast(np.ndarray, np.all(np.dot(test_rewards, normals.T) > 0, axis=1))
    return results

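# Minimal numeric sketch of the test semantics: a reward passes iff it lies
# strictly inside the intersection of the halfspaces {r : n @ r > 0}. The
# 2-feature normals are toy values, and assert_normals is assumed to accept
# normals of this width.
def _example_run_test() -> None:
    normals = np.array([[1.0, 0.0], [0.0, 1.0]])  # require r[0] > 0 and r[1] > 0
    test_rewards = np.array([[0.5, 0.5], [0.5, -0.5]])
    results = run_test(normals, test_rewards, use_equiv=False)
    assert results.tolist() == [True, False]
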
def test_orient_normals(actions: np.ndarray, reward: np.ndarray):
    reward = safe_normalize(reward)

    _, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    value_diffs = reward @ normals.T
    prefs = value_diffs > 0

    oriented_normals = orient_normals(normals, preferences=prefs)
    assert_normals(oriented_normals)

    # Orientation only flips signs, so the oriented value differences are
    # exactly the absolute values of the originals.
    assert np.all(reward @ oriented_normals.T == np.abs(value_diffs))

def main(datadir: Path) -> None:
    logging.basicConfig(level="INFO")

    datadir = Path(datadir)

    flags = pickle.load(open(datadir / "flags.pkl", "rb"))

    use_equiv = False

    sim = Driver()
    n_reward_features = sim.num_of_features

    inputs = np.load(datadir / "inputs.npy")
    n_questions = inputs.shape[0]
    assert inputs.shape[1] == 2

    input_features = np.load(datadir / "input_features.npy")
    assert input_features.shape == (n_questions, 2, n_reward_features), input_features.shape
    assert_input_feature_consistency(inputs, input_features, sim)

    normals = np.load(datadir / "normals.npy")
    logging.info(f"There are {normals.shape[0]} questions")
    assert_normals(normals, use_equiv, n_reward_features)
    assert_normal_consistency(input_features, normals)

    preferences = np.load(datadir / "preferences.npy")
    assert preferences.shape == (n_questions,)
    assert np.all((preferences == 1) | (preferences == -1))

    oriented_normals = orient_normals(normals, preferences)

    if (datadir / "true_reward.npy").exists():
        true_reward = np.load(datadir / "true_reward.npy")
        assert_reward(true_reward, use_equiv, n_reward_features)
        logging.info(f"true_reward={true_reward}")
        assert_true_reward_consistency(oriented_normals, true_reward)

    if (datadir / "mean_reward.npy").exists():
        mean_reward = np.load(datadir / "mean_reward.npy")
        logging.info(f"mean_reward={mean_reward}")
        assert_reward(mean_reward, use_equiv, n_reward_features)

        mean_accuracy = np.mean(oriented_normals @ mean_reward > 0)
        logging.info(f"Accuracy of mean reward function is {mean_accuracy}")

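# Invocation sketch: main() is a consistency checker for a single elicitation
# run; the path below is a placeholder.
# main(Path("data/run0"))
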
def simulated(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    human_samples: List[int] = [1],
    n_reward_samples: int = 1000,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    traj_opt: bool = False,
    datadir: Path = Path(),
    outdir: Path = Path(),
    deltas: List[Optional[float]] = [None],
    use_mean_reward: bool = False,
    use_random_test_questions: bool = False,
    n_random_test_questions: Optional[int] = None,
    use_cheating_questions: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    use_true_epsilon: bool = False,
    legacy_test_rewards: bool = False,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite_test_rewards: bool = False,
    overwrite_results: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """Evaluates alignment tests generated from ground-truth rewards."""
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)
        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(f"Replication {replication} does not exist, skipping")
                continue
            logging.info(f"Starting replication {replication}")
            simulated(
                epsilons=epsilons,
                deltas=deltas,
                n_rewards=n_rewards,
                human_samples=human_samples,
                n_reward_samples=n_reward_samples,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                traj_opt=traj_opt,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_mean_reward=use_mean_reward,
                use_random_test_questions=use_random_test_questions,
                use_cheating_questions=use_cheating_questions,
                n_random_test_questions=n_random_test_questions,
                skip_remove_duplicates=skip_remove_duplicates,
                skip_epsilon_filtering=skip_epsilon_filtering,
                skip_redundancy_filtering=skip_redundancy_filtering,
                use_true_epsilon=use_true_epsilon,
                legacy_test_rewards=legacy_test_rewards,
                n_cpus=n_cpus,
                overwrite_test_rewards=overwrite_test_rewards,
                overwrite_results=overwrite_results,
                verbosity=verbosity,
            )
        return

    logging.info(f"Using {n_cpus} cpus.")
    parallel = Parallel(n_jobs=n_cpus)

    outdir.mkdir(parents=True, exist_ok=True)

    if n_random_test_questions is not None:
        # Argh defaults to parsing an argument as a string if it's optional.
        n_random_test_questions = int(n_random_test_questions)

    # Equivalence queries are not supported by this evaluation path.
    use_equiv = False

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    env = Driver()
    n_reward_features = env.num_of_features

    logging.info("Loading elicitation results")
    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, n_reward_features)

    if use_equiv:
        true_reward = np.append(true_reward, [1])
    else:
        assert not np.any(elicited_preferences == 0)

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_reward_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
        use_true_epsilon=use_true_epsilon,
        true_reward=true_reward,
    )
    logging.info(
        f"""Filtering settings:
        # reward samples={n_reward_samples},
        use mean reward={use_mean_reward},
        skip duplicates={skip_remove_duplicates}
        skip noise={True}
        skip epsilon={skip_epsilon_filtering}
        skip redundancy={skip_redundancy_filtering}
        use true epsilon={use_true_epsilon}
        """
    )

    confusion_path, test_path = make_outnames(
        outdir,
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
    )
    confusions: Dict[Experiment, np.ndarray] = load(confusion_path, overwrite_results, default={})
    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path, overwrite_results, default={})

    experiments = make_experiments(
        epsilons, deltas, human_samples, overwrite_results, experiments=set(minimal_tests.keys())
    )

    if use_random_test_questions:
        logging.info("Making random test")
        logging.info(f"True reward: {true_reward}")
        normals, preferences, input_features = make_random_test(
            n_random_test_questions,
            elicited_input_features,
            elicited_preferences,
            reward_iterations=flags["reward_iterations"],
            query_type=query_type,
            equiv_size=flags["equiv_size"],
            sim=env,
            use_equiv=use_equiv,
        )

        good_indices = (true_reward @ normals.T) > 0
        logging.info(f"{np.mean(good_indices) * 100:.2f}% of new test questions agree with gt reward.")

        if use_cheating_questions:
            logging.info("Selecting only questions consistent with gt reward")
            normals = normals[good_indices]
            preferences = preferences[good_indices]
            input_features = input_features[good_indices]

        assert_normals(normals, use_equiv)
    else:
        max_n = max(human_samples)
        preferences = elicited_preferences[:max_n]
        input_features = elicited_input_features[:max_n]
        logging.debug(f"elicited_normals={elicited_normals[:10]}")
        normals = orient_normals(elicited_normals[:max_n], preferences, use_equiv, n_reward_features)
        logging.debug(f"normals={normals[:10]}")

        assert np.all(true_reward @ normals.T >= 0)

    if not legacy_test_rewards:
        test_rewards = make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            traj_opt=traj_opt,
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite_test_rewards,
        )
    else:
        test_rewards = legacy_make_test_rewards(1000, n_rewards, true_reward, epsilons, use_equiv)

    for indices, confusion, experiment in parallel(
        delayed(run_gt_experiment)(
            normals=normals,
            test_rewards=test_rewards[epsilon][0],
            test_reward_alignment=test_rewards[epsilon][1],
            epsilon=epsilon,
            delta=delta,
            use_equiv=use_equiv,
            n_human_samples=n,
            factory=factory,
            input_features=input_features,
            preferences=preferences,
            outdir=outdir,
            verbosity=verbosity,
        )
        for epsilon, delta, n in experiments
    ):
        minimal_tests[experiment] = indices
        confusions[experiment] = confusion

    pkl.dump(confusions, open(confusion_path, "wb"))
    pkl.dump(minimal_tests, open(test_path, "wb"))

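# Programmatic invocation sketch for simulated(). The paths are placeholders
# and assume elicitation outputs already exist under datadir; when driven from
# a CLI wrapper, these arguments arrive as flags instead.
def _example_simulated() -> None:
    simulated(
        epsilons=[0.0, 0.1],
        deltas=[0.05],
        n_rewards=50,
        human_samples=[1, 5, 10],
        datadir=Path("data/run0"),    # hypothetical input directory
        outdir=Path("results/run0"),  # hypothetical output directory
        n_cpus=4,
    )
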
def test_make_normals(actions: np.ndarray):
    features, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    # Each normal is the difference between a question's two feature vectors.
    assert np.all((features[:, 0] - features[:, 1]) == normals)
    assert_normals(normals)

def make_normals(input_features: np.ndarray) -> np.ndarray:
    normals = input_features[:, 0] - input_features[:, 1]
    assert_normals(normals, False, input_features.shape[2])
    return normals

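# Numeric sketch: each normal is the feature difference of a question's two
# trajectories, so sign(reward @ normal) encodes which trajectory is preferred.
# The toy array below assumes one question with two 2-dimensional feature vectors.
def _example_make_normals() -> None:
    input_features = np.array([[[1.0, 0.0], [0.0, 1.0]]])  # (1 question, 2 trajs, 2 features)
    normals = make_normals(input_features)
    assert normals.tolist() == [[1.0, -1.0]]
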