def get_debate_results(
    start_point,
    use_test_data,
    batch_size,
    N_samples,
    N_to_mask,
    judge_path,
    restricted_first,
):
    """
    Plays debates for a batch of consecutive samples and collects the results.

    For every sample index in [start_point, start_point + batch_size) that
    exists in the dataset, ten debates are played (one per label 0-9) between
    an agent precommitted to that label ("restricted") and an agent without
    precommit ("unrestricted").

    start_point: Index of the first sample of the batch.
    use_test_data: If True, samples are taken from judge.eval_data, otherwise
        from judge.train_data.
    batch_size: Maximum number of samples to process.
    N_samples: Not used by this function.
    N_to_mask: Passed to the judge and to every debate.
    judge_path: model_dir the MNISTJudge is created with.
    restricted_first: If True, the restricted agent is the first entry of the
        agent pair handed to Debate, otherwise the unrestricted agent is.

    Returns a list with one 10x10 array per processed sample. Row `label`
    holds the value returned by debate.play(full_report=True) for the debate
    in which the restricted agent precommitted to `label` (presumably the
    judge's probabilities for the 10 classes -- confirm against Debate.play).

    The number of rollouts is read from the module-level `args` object.
    """
    # MNISTJudge has to be imported here, because otherwise tensorflow does not
    # work together with multiprocessing
    from judge import MNISTJudge

    judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=judge_path, binary_rewards=False)
    dataset = judge.eval_data if use_test_data else judge.train_data
    n_available = dataset.shape[0]

    result_list = []
    for i in range(batch_size):
        sample_index = start_point + i
        print("i", sample_index, flush=True)
        t = time.time()
        # ">=" rather than ">": the last valid index is n_available - 1, so
        # sample_index == n_available would already raise an IndexError.
        if sample_index >= n_available:
            # end of dataset
            break

        # The sample does not depend on the label, so fetch it only once.
        sample = dataset[sample_index]
        results_per_label = np.zeros([10, 10])
        for label in range(10):
            unrestricted_agent = DebateAgent(
                precommit_label=None, agentStrength=args.rollouts
            )
            restricted_agent = DebateAgent(
                precommit_label=label, agentStrength=args.rollouts
            )
            # The original code had these two branches swapped, i.e. the
            # restricted agent came second when restricted_first was True.
            if restricted_first:
                agent1, agent2 = restricted_agent, unrestricted_agent
            else:
                agent1, agent2 = unrestricted_agent, restricted_agent
            debate = Debate((agent1, agent2), judge, N_to_mask, sample.flat)
            probabilities = debate.play(full_report=True)
            results_per_label[label] = probabilities
        result_list.append(results_per_label)
        print("time", time.time() - t)
    return result_list
def run(
    N_to_mask,
    sample_id,
    lying_agent_label,
    judge_path,
    dataset,
    rollouts,
    index_of_true_agent,
    binary_rewards,
    changing_sides,
):
    """
    Runs one debate game on a given sample from a specified dataset (either
    "mnist" or "fashion") with N_to_mask rounds. The debate is modeled by MCTS
    with given number of rollouts. One player is the honest agent arguing for
    the correct label, the other either precommits to a lying_agent_label or
    debates without precommit if lying_agent_label is set to None.

    sample_id: Index of the sample in the judge's evaluation data.
    judge_path: Directory of the judge model. If empty, the path
        "saved_models/<dataset><N_to_mask>" is used instead.
    index_of_true_agent: Either 0 or 1 whether the honest agent plays first
        or second.
    binary_rewards: If set to True, rewards are either 1 or -1, if set False
        rewards lie between -1 and 1
    changing_sides: If set to True, agents switch sides after each round, if
        set to False, the first agent reveals N_to_mask/2 features followed
        by N_to_mask/2 features of the second agent

    Raises an Exception if neither judge_path nor dataset is given, if the
    dataset is unknown, or if lying_agent_label equals the true label.
    The result of the debate is printed, nothing is returned.
    """
    # Parse parameters
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("dataset must be specified")

    if dataset == "mnist":
        judge_class = MNISTJudge
    elif dataset == "fashion":
        judge_class = FashionJudge
    else:
        # Formatting instead of "+" concatenation: with a judge_path but
        # dataset=None, concatenation raised a TypeError and hid this message.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))
    judge = judge_class(
        N_to_mask=N_to_mask, model_dir=path, binary_rewards=binary_rewards
    )

    # Get debate data and run the debate
    sample = judge.eval_data[sample_id].flatten()
    label = judge.eval_labels[sample_id]

    if lying_agent_label == label:
        raise Exception("Lying label cannot be the same as the correct one")

    agent_lie = DebateAgent(precommit_label=lying_agent_label, agentStrength=rollouts)
    agent_truth = DebateAgent(precommit_label=label, agentStrength=rollouts)

    assert index_of_true_agent in [0, 1]
    player_description = DebatePlayers(
        agent_truth, agent_lie, index_of_true_agent, our_name="truth", opp_name="liar"
    )

    debate = Debate(
        player_description.agents,
        judge,
        N_to_mask,
        sample,
        debug=True,
        changing_sides=changing_sides,
    )
    utility = debate.play()
    player_description.print_debate_result(utility, label)
"""
Used for testing the MNIST judge. Trains a judge for a (hardcoded) number
of steps and prints out its accuracy.
"""

import numpy as np

from judge import MNISTJudge

if __name__ == "__main__":
    N_PIXELS = 4  # number of revealed pixels, same value the judge is built with
    judge = MNISTJudge(N_PIXELS)

    # Build a mask that reveals N_PIXELS nonzero pixels of the first
    # evaluation image.
    img = judge.eval_data[0]
    img_flat = np.reshape(img, img.shape[0] * img.shape[1])
    nonzero = np.where(img_flat > 0)[0]
    # replace=False: sampling with replacement (the default) can draw the same
    # index twice, which would leave fewer than N_PIXELS pixels in the mask.
    idx = np.random.choice(nonzero, N_PIXELS, replace=False)
    mask_flat = np.zeros_like(img_flat)
    mask_flat[idx] = 1

    N_steps = 100
    judge.train(N_steps)
    print(judge.evaluate_accuracy())
    print(
        judge.evaluate_debate(np.stack((mask_flat, img_flat * mask_flat)), [0, 2])
    )
"--train-steps",type=int, help="Number of training steps. If more than 1, apply to corresponding index of n-zero" ) parser.add_argument( "--n-zero", nargs="*", type=float, default=[0], help="Number of 0-pixels to sample" ) parser.add_argument( "--path", type=str, help="Path to save the trained judge to (and restore from)" ) args = parser.parse_args() path = args.path or "saved_models/" + args.dataset + str(args.N_to_mask) if args.dataset == "mnist": judge = MNISTJudge(N_to_mask=args.N_to_mask, model_dir=path) elif args.dataset == "fashion": judge = FashionJudge(N_to_mask=args.N_to_mask, model_dir=path) else: raise Exception("Unknown dataset " + args.dataset) n_zero = args.n_zero if len(args.n_zero) > 1 else args.n_zero[0] t = time.time() if args.train_steps == 0: print('Received 0 steps. Will not train.') else: judge.train(args.train_steps, n_zero) print('Time', time.time() - t) print('Accuracy', judge.evaluate_accuracy(n_zero)) for i in range(args.N_to_mask + 1): print('Accuracy', i, 'black pixels', judge.evaluate_accuracy(i))
from judge import MNISTJudge

# Script: reads precomputed debate results from a file and counts how often
# the restricted (precommitted) agent wins, and how often it wins while being
# committed to the true label.
# NOTE(review): the nesting of the two `if` statements in the loop below was
# reconstructed from a whitespace-collapsed source -- confirm against the
# original file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--N-to-mask",
        type=int,
        help="Number of features revealed as an input",
        required=True,
    )
    parser.add_argument(
        "--file", type=str, required=True, help="File containing debate results"
    )
    args = parser.parse_args()

    # The judge is only used here to get access to the training labels.
    judge = MNISTJudge(N_to_mask=args.N_to_mask)
    train_labels = judge.train_labels

    truth_wins_count, restricted_wins_count = 0, 0

    # Raw binary file (np.fromfile default dtype, float64) interpreted as one
    # 10x10 matrix per sample: debate_results[i, label] is the result row of
    # the debate in which the restricted agent precommitted to `label`.
    debate_results = np.fromfile(args.file).reshape(-1, 10, 10)
    n_samples = debate_results.shape[0]

    for i in range(n_samples):
        # assumes result i belongs to training sample i -- TODO confirm
        true_label = train_labels[i]
        for label in range(10):
            judge_probabilities = debate_results[i, label]
            # The restricted agent wins if its label has the (possibly tied)
            # highest value in the row.
            if np.all(judge_probabilities[label] >= judge_probabilities):
                restricted_wins_count += 1
                if label == true_label:
                    truth_wins_count += 1

    print("\n-------------------------------")
"""
Small debugging script.

Checks that the judge's 'predictor' and 'estimator' report the same
prediction accuracy (both are supposed to wrap the same model).
"""

from judge import MNISTJudge, FashionJudge

if __name__ == "__main__":
    # Hardcoded configuration of the judge to check
    dataset = "mnist"
    judge_path = "judge_mnist_4"
    N_to_mask = 4

    # An explicit judge path wins over the default location for the dataset
    if not judge_path and not dataset:
        raise Exception("Either judge_path or dataset needs to be specified")
    path = judge_path if judge_path else "saved_models/" + dataset + str(N_to_mask)

    if dataset not in ("mnist", "fashion"):
        raise Exception("Unknown dataset in " + "dataset.txt: " + dataset)
    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)

    # Both numbers are expected to agree
    estimator_accuracy = judge.evaluate_accuracy()
    print("accuracy estimator", estimator_accuracy)
    predictor_accuracy = judge.evaluate_accuracy_using_predictor()
    print("accuracy predictor", predictor_accuracy)
def run(
    N_to_mask,
    judge_path,
    dataset,
    nmbr_samples,
    start_at_sample,
    eval_unrestricted,
    rollouts,
    index_of_truth_agent,
    changing_sides,
    compute_confusion_matrix,
    precom_eval_seeds,
    image_directory,
    N_images_to_save,
    allow_black_pixels,
):
    """
    Evaluates debate on a given number of samples of a given dataset ("mnist",
    "fashion"). Each debate has N_to_mask rounds. The debate is either modeled
    with precommit or unrestricted given the eval_unrestricted parameter. The
    precommitted debate is evaluated by the way described in the "AI safety
    via debate" paper, the unrestricted debate is played once for each sample.

    judge_path: Directory of the judge model. If empty, the path
        "saved_models/<dataset><N_to_mask>" is used instead.
    nmbr_samples: Number of samples to evaluate. If falsy (None or 0), all
        evaluation samples from start_at_sample to the end are used.
    start_at_sample: Index of the first evaluation sample.
    index_of_truth_agent: Either 0 or 1 whether the honest agent plays first
        or second.
    changing_sides: If set to True, agents switch sides after each move, if
        set to False, the first agent reveals N_to_mask/2 features followed
        by N_to_mask/2 features of the second agent
    compute_confusion_matrix: If True, compute confusion matrix as in figure 3
        in the AISvD paper. Only for restricted debate.
    precom_eval_seeds: Number of seeds used to evaluate the precommitted
        debate. Must be odd.
    image_directory: Directory images are saved to. Required if
        N_images_to_save is set.
    N_images_to_save: Images are saved only for the first N_images_to_save
        samples; None means for all samples (if image_directory is set).
    allow_black_pixels: Forwarded unchanged to the per-sample evaluation
        functions.

    Progress and results are printed, nothing is returned.
    """
    # Parse parameters
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        # Formatting instead of "+" concatenation: with a judge_path but
        # dataset=None, concatenation raised a TypeError and hid this message.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))

    if not nmbr_samples:
        # Only the samples from start_at_sample onwards are left to evaluate.
        # Using the full dataset length here made the loop below run past the
        # end of the data whenever start_at_sample > 0.
        nmbr_samples = len(judge.eval_data) - start_at_sample
        if nmbr_samples <= 0:
            raise Exception("start_at_sample is beyond the end of the dataset")

    if (precom_eval_seeds % 2) != 1:
        raise Exception(
            "Number of seeds to evaluate the precommited debate must be odd")

    assert image_directory or not N_images_to_save

    print("Parameters")
    print("--------")
    print("N_to_mask:", N_to_mask)
    print("judge_path:", judge_path)
    print("dataset:", dataset)
    print("nmbr_samples:", nmbr_samples)
    print("start_at_sample:", start_at_sample)
    print("eval_unrestricted:", eval_unrestricted)
    print("rollouts:", rollouts)
    print("index_of_truth_agent:", index_of_truth_agent)
    print("changing_sides:", changing_sides)
    print("compute_confusion_matrix:", compute_confusion_matrix)
    print("precom_eval_seeds:", precom_eval_seeds)
    print("image_directory:", image_directory)
    print("N_images_to_save:", N_images_to_save)
    print("allow_black_pixels:", allow_black_pixels)
    print("--------")
    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)
    print("--------", flush=True)

    # Prepare for confusion
    # confusion_matrix_counter[true_label][lying_label] specifies the number
    # of times 'lying_label' won over 'true_label'
    # labels_frequency counts the occurrences of each label
    if compute_confusion_matrix:
        if eval_unrestricted:
            raise Exception(
                "Consusion matrix can be computed only while evaluating restricted debate"
            )
        confusion_matrix_counter = np.zeros((10, 10), dtype='f')
        labels_frequency = [0] * 10
    else:
        confusion_matrix_counter = None

    # Run debate for each sample
    overall_truth_win_count = 0
    for sample_id in range(start_at_sample, start_at_sample + nmbr_samples):
        sample_start_time = time.time()
        sample = judge.eval_data[sample_id].flatten()
        label = judge.eval_labels[sample_id]
        samples_done = sample_id - start_at_sample + 1
        save_img = image_directory and (
            N_images_to_save is None
            or sample_id - start_at_sample < N_images_to_save)

        # Reproduce the experiment from AI safety via debate paper
        if not eval_unrestricted:
            if save_img:
                # one sub-directory per sample
                dirname = image_directory + '/img' + str(sample_id + 1) + '/'
                makedirs(dirname, exist_ok=True)
            else:
                dirname = None
            truth_won = evaluate_sample_restricted(
                N_to_mask, sample, label, judge, rollouts,
                index_of_truth_agent, changing_sides, allow_black_pixels,
                precom_eval_seeds, confusion_matrix_counter, dirname)
            if compute_confusion_matrix:
                labels_frequency[label] += 1

        # Evaluate unrestricted debate (without precommit)
        else:
            if save_img:
                makedirs(image_directory, exist_ok=True)
                filename = image_directory + '/img' + str(sample_id + 1)
            else:
                filename = None
            truth_won = evaluate_sample_unrestricted(
                N_to_mask, sample, label, judge, rollouts,
                index_of_truth_agent, changing_sides, allow_black_pixels,
                filename)

        print("\t Sample {}".format(sample_id + 1), end=" ", flush=True)
        if truth_won:
            overall_truth_win_count += 1
            print("Winner: Truth.", end=" ", flush=True)
        else:
            print("Winner: Liar.", end=" ", flush=True)
        print(
            "Truth winrate: {} out of {} ({}%)".format(
                overall_truth_win_count,
                samples_done,
                100 * overall_truth_win_count / samples_done,
            ),
            flush=True,
        )
        print("\t Sample time: {}".format(time.time() - sample_start_time))

    print(
        "Overall truth winrate: {} out of {} ({}%)".format(
            overall_truth_win_count,
            nmbr_samples,
            100 * overall_truth_win_count / nmbr_samples,
        ),
        flush=True,
    )

    if compute_confusion_matrix:
        build_confusion_matrix(confusion_matrix_counter, labels_frequency,
                               dataset, show_matrix=False)
"""
Test script for the debate classifier.

Trains the classifier on MNIST using the true labels and reports its
accuracy on the evaluation data. Only used for testing.
"""

import numpy as np

from judge import MNISTJudge
from agent import DebateClassifier

if __name__ == "__main__":
    # The judge is only needed as a source of the MNIST data.
    judge = MNISTJudge(4)
    train_data, train_labels = judge.train_data, judge.train_labels
    eval_data, eval_labels = judge.eval_data, judge.eval_labels

    debate_classifier = DebateClassifier()

    n_train = len(train_data)
    for step in range(100):
        # Consecutive batches of (at most) 128 samples, wrapping around at
        # the end of the training data.
        first = (step * 128) % n_train
        last = min(first + 128, n_train)
        labels = train_labels[first:last]
        debate_classifier.train(
            train_data[first:last],
            labels,
            np.ones_like(labels, dtype=np.float32),  # uniform loss weights
        )

    acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
    print("Accuracy", acc)
def run(
    N_to_mask,
    judge_path,
    dataset,
    rollouts,
    N_epochs,
    batch_size,
    learning_rate,
    learning_rate_decay,
    classifier_path,
    cheat_debate,
    only_update_for_wins,
    precomputed_debate_results_restricted_first_path,
    precomputed_debate_results_restricted_second_path,
    shuffle_batches,
    use_dropout,
    importance_sampling_weights,
    importance_sampling_cap,
):
    """
    Trains a DebateClassifier from the outcome of debates.

    For every training sample a label is drawn from the classifier's current
    predicted probabilities. Whether an agent precommitted to that label wins
    the debate decides the loss weight of the (sample, label) pair: 1 for a
    win, otherwise 0 if only_update_for_wins is set and -0.1 if not. The
    classifier is updated after every batch_size samples and at the end of
    each pass over the training data.

    judge_path: Directory of the judge model. If empty, the path
        "saved_models/<dataset><N_to_mask>" is used instead.
    dataset: "mnist" or "fashion".
    rollouts: agentStrength of the debate agents (only used when debates are
        actually played).
    N_epochs: Number of passes over the training data.
    cheat_debate: If True, no debate is played; the drawn label wins exactly
        if it equals the true training label.
    precomputed_debate_results_restricted_first_path,
    precomputed_debate_results_restricted_second_path: Files with precomputed
        debate results, read with np.fromfile and reshaped to (-1, 10, 10).
        Either both or none must be given; they cannot be combined with
        cheat_debate.
    importance_sampling_weights: If True, each weight is multiplied by
        1 / probability of the drawn label, limited to
        importance_sampling_cap if that is not None.
    learning_rate, learning_rate_decay, classifier_path, use_dropout:
        Forwarded to the DebateClassifier constructor.
    shuffle_batches: Forwarded to DebateClassifier.train.

    Progress and accuracies are printed, nothing is returned.
    """
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        # Formatting instead of "+" concatenation: with a judge_path but
        # dataset=None, concatenation raised a TypeError and hid this message.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))

    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)

    if precomputed_debate_results_restricted_first_path is not None:
        assert precomputed_debate_results_restricted_second_path is not None
        if cheat_debate:
            raise Exception(
                "cheat_debate should not be enabled when training "
                "from precomputed debate results"
            )
        debate_results_restricted_first = np.fromfile(
            precomputed_debate_results_restricted_first_path
        ).reshape(-1, 10, 10)
        debate_results_restricted_second = np.fromfile(
            precomputed_debate_results_restricted_second_path
        ).reshape(-1, 10, 10)
        print(
            "Loaded debate results from {} and {}".format(
                precomputed_debate_results_restricted_first_path,
                precomputed_debate_results_restricted_second_path,
            )
        )
        print("These will be used for training instead of re-running the debates.")
    else:
        debate_results_restricted_first, debate_results_restricted_second = None, None

    train_data = judge.train_data
    N_train = len(judge.train_labels)
    eval_data = judge.eval_data
    eval_labels = judge.eval_labels

    debate_classifier = DebateClassifier(
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay,
        model_dir=classifier_path,
        use_dropout=use_dropout,
    )

    batch_samples = []
    batch_labels = []
    batch_weights = []

    t = time.time()

    for epoch in range(N_epochs):
        for i in range(N_train):
            sample = train_data[i]
            # Draw the label to argue for from the classifier's own prediction
            probs = next(debate_classifier.predict(sample))["probabilities"]
            label = np.random.choice(range(len(probs)), p=probs)
            restricted_first = np.random.random() < 0.5

            # Determine whether the agent committed to `label` wins
            if cheat_debate:
                # simulate a perfectly accurate debate
                restricted_won = label == judge.train_labels[i]
            elif debate_results_restricted_first is not None:
                assert debate_results_restricted_second is not None
                if restricted_first:
                    debate_results = debate_results_restricted_first
                else:
                    debate_results = debate_results_restricted_second
                # use precomputed results
                judge_probabilities = debate_results[i, label]
                restricted_won = np.all(
                    judge_probabilities[label] >= judge_probabilities
                )
            else:
                # run non-precommited debate
                agent_unrestricted = DebateAgent(
                    precommit_label=None, agentStrength=rollouts
                )
                agent_restricted = DebateAgent(
                    precommit_label=label, agentStrength=rollouts
                )
                if restricted_first:
                    agent1, agent2 = agent_restricted, agent_unrestricted
                else:
                    agent1, agent2 = agent_unrestricted, agent_restricted
                debate = Debate((agent1, agent2), judge, N_to_mask, sample.flat)
                utility = debate.play()
                # utility 1 is treated as a win of the first agent, -1 as a
                # win of the second agent
                restricted_won = (utility == 1 and restricted_first) or (
                    utility == -1 and not restricted_first
                )

            # Same win -> weight mapping for all three modes above
            if restricted_won:
                weight = 1
            elif only_update_for_wins:
                weight = 0
            else:
                weight = -0.1

            if importance_sampling_weights:
                importance_sampling_factor = 1 / probs[label]
                if (
                    importance_sampling_cap is not None
                    and importance_sampling_factor > importance_sampling_cap
                ):
                    importance_sampling_factor = importance_sampling_cap
                weight *= importance_sampling_factor

            batch_samples.append(sample)
            batch_labels.append(label)
            batch_weights.append(weight)

            # A batch is complete after batch_size samples or at the last
            # sample of the epoch
            if (i + 1) % batch_size == 0 or i == N_train - 1:
                # update debate classifier
                print("i", i, flush=True)
                print("batch_weights", batch_weights, flush=True)
                debate_classifier.train(
                    np.array(batch_samples),
                    np.array(batch_labels),
                    np.array(batch_weights),
                    shuffle=shuffle_batches,
                )
                acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
                print("Updated debate_classifier", flush=True)
                print("Evaluation accuracy", acc, flush=True)
                t2 = time.time()
                print("Batch time ", t2 - t)
                t = t2
                batch_samples = []
                batch_labels = []
                batch_weights = []

    acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
    print("Accuracy", acc, flush=True)
def test_mnist_judge():
    """Smoke test: the judge stores the N_to_mask it was constructed with."""
    # dummy test, would make sense to write more at some point
    n_to_mask = 700
    judge = MNISTJudge(N_to_mask=n_to_mask)
    stored = judge.N_to_mask
    assert stored == n_to_mask