""" Used for testing the MNIST judge. Trains a judge for a (hardcoded) number of steps and prints out its accuracy. """ import numpy as np from judge import MNISTJudge if __name__ == "__main__": judge = MNISTJudge(4) img = judge.eval_data[0] img_flat = np.reshape(img, img.shape[0] * img.shape[1]) nonzero = np.where(img_flat > 0)[0] idx = np.random.choice(np.array(nonzero), 4) mask_flat = np.zeros_like(img_flat) mask_flat[idx] = 1 N_steps = 100 judge.train(N_steps) print(judge.evaluate_accuracy()) print( judge.evaluate_debate(np.stack((mask_flat, img_flat * mask_flat)), [0, 2]))
help="Number of training steps. If more than 1, apply to corresponding index of n-zero" ) parser.add_argument( "--n-zero", nargs="*", type=float, default=[0], help="Number of 0-pixels to sample" ) parser.add_argument( "--path", type=str, help="Path to save the trained judge to (and restore from)" ) args = parser.parse_args() path = args.path or "saved_models/" + args.dataset + str(args.N_to_mask) if args.dataset == "mnist": judge = MNISTJudge(N_to_mask=args.N_to_mask, model_dir=path) elif args.dataset == "fashion": judge = FashionJudge(N_to_mask=args.N_to_mask, model_dir=path) else: raise Exception("Unknown dataset " + args.dataset) n_zero = args.n_zero if len(args.n_zero) > 1 else args.n_zero[0] t = time.time() if args.train_steps == 0: print('Received 0 steps. Will not train.') else: judge.train(args.train_steps, n_zero) print('Time', time.time() - t) print('Accuracy', judge.evaluate_accuracy(n_zero)) for i in range(args.N_to_mask + 1): print('Accuracy', i, 'black pixels', judge.evaluate_accuracy(i))
def run(
    N_to_mask,
    judge_path,
    dataset,
    nmbr_samples,
    start_at_sample,
    eval_unrestricted,
    rollouts,
    index_of_truth_agent,
    changing_sides,
    compute_confusion_matrix,
    precom_eval_seeds,
    image_directory,
    N_images_to_save,
    allow_black_pixels,
):
    """
    Evaluates debate on a given number of samples of a given dataset
    ("mnist", "fashion"). Each debate has N_to_mask rounds.

    The debate is either modeled with precommit or unrestricted, depending on
    the eval_unrestricted parameter. The precommited debate is evaluated the
    way described in the "AI safety via debate" paper; the unrestricted debate
    is played once for each sample.

    N_to_mask: Number of rounds per debate; also used to build the default
        judge path and passed to the judge constructor.
    judge_path: Directory of the trained judge. If falsy, the path
        "saved_models/<dataset><N_to_mask>" is used instead.
    dataset: "mnist" or "fashion"; selects the judge class.
    nmbr_samples: Number of samples to evaluate. If falsy, all evaluation
        samples from start_at_sample to the end of the data are used.
    start_at_sample: Index of the first evaluation sample to use.
    eval_unrestricted: If True, evaluate the unrestricted debate, otherwise
        the precommited (restricted) one.
    rollouts: Passed through to the per-sample evaluation functions.
    index_of_truth_agent: Either 0 or 1 whether the honest agent plays first
        or second.
    changing_sides: If set to True, agents switch sides after each move; if
        set to False, the first agent reveals N_to_mask/2 features followed
        by N_to_mask/2 features of the second agent.
    compute_confusion_matrix: If True, compute confusion matrix as in figure 3
        in the AISvD paper. Only for restricted debate.
    precom_eval_seeds: Number of seeds used to evaluate the precommited
        debate; must be odd.
    image_directory: Directory in which images are saved; falsy disables
        saving.
    N_images_to_save: Save images only for the first N_images_to_save
        samples; None means every sample (requires image_directory).
    allow_black_pixels: Passed through to the per-sample evaluation functions.

    Raises Exception on inconsistent parameters and AssertionError if
    N_images_to_save is set without image_directory.
    """
    # Parse parameters
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge_class = MNISTJudge
    elif dataset == "fashion":
        judge_class = FashionJudge
    else:
        # format() instead of "+" so that dataset=None (possible when only
        # judge_path is given) yields this error rather than a TypeError.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))

    # Validate the cheap constraints before loading the judge and evaluating
    # its accuracy, so that bad parameters fail immediately.
    if (precom_eval_seeds % 2) != 1:
        raise Exception(
            "Number of seeds to evaluate the precommited debate must be odd")
    assert image_directory or not N_images_to_save, (
        "N_images_to_save requires image_directory to be set")
    if compute_confusion_matrix and eval_unrestricted:
        raise Exception(
            "Confusion matrix can be computed only while evaluating restricted debate"
        )

    judge = judge_class(N_to_mask=N_to_mask, model_dir=path)

    if not nmbr_samples:
        # Default: every sample from start_at_sample to the end of the data.
        # Using the full length here would index past the end whenever
        # start_at_sample > 0.
        nmbr_samples = len(judge.eval_data) - max(start_at_sample, 0)
        if nmbr_samples <= 0:
            raise Exception(
                "start_at_sample ({}) is past the end of the evaluation data "
                "({} samples)".format(start_at_sample, len(judge.eval_data)))

    print("Parameters")
    print("--------")
    for param_name, param_value in (
        ("N_to_mask:", N_to_mask),
        ("judge_path:", judge_path),
        ("dataset:", dataset),
        ("nmbr_samples:", nmbr_samples),
        ("start_at_sample:", start_at_sample),
        ("eval_unrestricted:", eval_unrestricted),
        ("rollouts:", rollouts),
        ("index_of_truth_agent:", index_of_truth_agent),
        ("changing_sides:", changing_sides),
        ("compute_confusion_matrix:", compute_confusion_matrix),
        ("precom_eval_seeds:", precom_eval_seeds),
        ("image_directory:", image_directory),
        ("N_images_to_save:", N_images_to_save),
        ("allow_black_pixels:", allow_black_pixels),
    ):
        print(param_name, param_value)
    print("--------")
    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)
    print("--------", flush=True)

    # Prepare for confusion
    # confusion_matrix_counter[true_label][lying_label] specifies the number
    # of times 'lying_label' won over 'true_label'
    # labels_frequency counts the occurrences of each label
    if compute_confusion_matrix:
        confusion_matrix_counter = np.zeros((10, 10), dtype='f')
        labels_frequency = [0] * 10
    else:
        confusion_matrix_counter = None

    # Run debate for each sample
    overall_truth_win_count = 0
    for sample_id in range(start_at_sample, start_at_sample + nmbr_samples):
        sample_start_time = time.time()
        sample = judge.eval_data[sample_id].flatten()
        label = judge.eval_labels[sample_id]
        samples_done = sample_id - start_at_sample + 1

        # Images are saved only for the first N_images_to_save samples
        # (or for all of them if N_images_to_save is None).
        save_img = image_directory and (
            N_images_to_save is None
            or sample_id - start_at_sample < N_images_to_save)

        if not eval_unrestricted:
            # Reproduce the experiment from AI safety via debate paper
            if save_img:
                dirname = image_directory + '/img' + str(sample_id + 1) + '/'
                makedirs(dirname, exist_ok=True)
            else:
                dirname = None
            truth_won = evaluate_sample_restricted(
                N_to_mask, sample, label, judge, rollouts,
                index_of_truth_agent, changing_sides, allow_black_pixels,
                precom_eval_seeds, confusion_matrix_counter, dirname)
            if compute_confusion_matrix:
                labels_frequency[label] += 1
        else:
            # Evaluate unrestricted debate (without precommit)
            if save_img:
                makedirs(image_directory, exist_ok=True)
                filename = image_directory + '/img' + str(sample_id + 1)
            else:
                filename = None
            truth_won = evaluate_sample_unrestricted(
                N_to_mask, sample, label, judge, rollouts,
                index_of_truth_agent, changing_sides, allow_black_pixels,
                filename)

        print("\t Sample {}".format(sample_id + 1), end=" ", flush=True)
        if truth_won:
            overall_truth_win_count += 1
            print("Winner: Truth.", end=" ", flush=True)
        else:
            print("Winner: Liar.", end=" ", flush=True)
        print(
            "Truth winrate: {} out of {} ({}%)".format(
                overall_truth_win_count,
                samples_done,
                100 * overall_truth_win_count / samples_done,
            ),
            flush=True,
        )
        print("\t Sample time: {}".format(time.time() - sample_start_time))

    print(
        "Overall truth winrate: {} out of {} ({}%)".format(
            overall_truth_win_count,
            nmbr_samples,
            100 * overall_truth_win_count / nmbr_samples,
        ),
        flush=True,
    )

    if compute_confusion_matrix:
        build_confusion_matrix(confusion_matrix_counter, labels_frequency,
                               dataset, show_matrix=False)
""" Minor script used for debugging. The purpose is to ensure the judge's 'predictor' and 'estimator' yield the same prediction accuracy (because they are supposed to contain the same model). """ from judge import MNISTJudge, FashionJudge if __name__ == "__main__": dataset = "mnist" judge_path = "judge_mnist_4" N_to_mask = 4 if judge_path: path = judge_path elif dataset: path = "saved_models/" + dataset + str(N_to_mask) else: raise Exception("Either judge_path or dataset needs to be specified") if dataset == "mnist": judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path) elif dataset == "fashion": judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path) else: raise Exception("Unknown dataset in " + "dataset.txt: " + dataset) print("accuracy estimator", judge.evaluate_accuracy()) print("accuracy predictor", judge.evaluate_accuracy_using_predictor())
def run(
    N_to_mask,
    judge_path,
    dataset,
    rollouts,
    N_epochs,
    batch_size,
    learning_rate,
    learning_rate_decay,
    classifier_path,
    cheat_debate,
    only_update_for_wins,
    precomputed_debate_results_restricted_first_path,
    precomputed_debate_results_restricted_second_path,
    shuffle_batches,
    use_dropout,
    importance_sampling_weights,
    importance_sampling_cap,
):
    """
    Train a DebateClassifier using debate outcomes as the training signal.

    For every training sample a label is drawn from the classifier's current
    predicted probabilities. Whether that label "wins" is decided by one of:
      * cheat_debate: the label wins iff it equals the true training label;
      * precomputed debate results (loaded from the two given files and
        reshaped to (-1, 10, 10)): the label wins iff its judge probability
        is at least as large as every other entry;
      * otherwise: a debate is played between an agent precommitted to the
        label and an agent without precommit, and the label wins iff the
        precommitted agent's side wins.
    A win gets weight 1; a loss gets weight 0 if only_update_for_wins is set
    and -0.1 otherwise. With importance_sampling_weights the weight is
    multiplied by 1 / probs[label], optionally capped at
    importance_sampling_cap. Samples are accumulated and the classifier is
    trained every batch_size samples and at the end of each epoch.

    Raises Exception for an unspecified/unknown dataset or when cheat_debate
    is combined with precomputed debate results.
    """
    # Locate and load the judge.
    if not judge_path and not dataset:
        raise Exception("Either judge_path or dataset needs to be specified")
    path = judge_path if judge_path else "saved_models/" + dataset + str(N_to_mask)

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        raise Exception("Unknown dataset in " + "dataset.txt: " + dataset)

    print("Judge accuracy:", judge.evaluate_accuracy())

    # Optionally load precomputed debate results instead of playing debates.
    first_path = precomputed_debate_results_restricted_first_path
    second_path = precomputed_debate_results_restricted_second_path
    if first_path is None:
        results_first = results_second = None
    else:
        assert second_path is not None
        if cheat_debate:
            raise Exception(
                "cheat_debate should not be enabled when training "
                "from precomputed debate results"
            )
        results_first = np.fromfile(first_path).reshape(-1, 10, 10)
        results_second = np.fromfile(second_path).reshape(-1, 10, 10)
        print(
            "Loaded debate results from {} and {}".format(first_path, second_path)
        )
        print("These will be used for training instead of re-running the debates.")

    train_data = judge.train_data
    N_train = len(judge.train_labels)
    eval_data = judge.eval_data
    eval_labels = judge.eval_labels

    debate_classifier = DebateClassifier(
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay,
        model_dir=classifier_path,
        use_dropout=use_dropout,
    )

    # Weight assigned to a sampled label that did not win.
    losing_weight = 0 if only_update_for_wins else -0.1

    batch_samples, batch_labels, batch_weights = [], [], []
    batch_start = time.time()

    for _epoch in range(N_epochs):
        for i in range(N_train):
            sample = train_data[i]

            # Draw a label from the classifier's current prediction, then
            # decide which side the precommitted agent plays.
            probs = next(debate_classifier.predict(sample))["probabilities"]
            label = np.random.choice(range(len(probs)), p=probs)
            restricted_first = np.random.random() < 0.5

            if cheat_debate:
                # simulate a perfectly accurate debate
                won = label == judge.train_labels[i]
            elif results_first is not None:
                # use precomputed results
                assert results_second is not None
                debate_results = results_first if restricted_first else results_second
                judge_probabilities = debate_results[i, label]
                won = np.all(judge_probabilities[label] >= judge_probabilities)
            else:
                # run non-precommited debate
                agent_unrestricted = DebateAgent(
                    precommit_label=None, agentStrength=rollouts
                )
                agent_restricted = DebateAgent(
                    precommit_label=label, agentStrength=rollouts
                )
                if restricted_first:
                    agents = (agent_restricted, agent_unrestricted)
                else:
                    agents = (agent_unrestricted, agent_restricted)
                debate = Debate(agents, judge, N_to_mask, sample.flat)
                utility = debate.play()
                # Utility 1 means the first agent won, -1 the second.
                won = utility == (1 if restricted_first else -1)

            weight = 1 if won else losing_weight

            if importance_sampling_weights:
                importance_sampling_factor = 1 / probs[label]
                exceeds_cap = (
                    importance_sampling_cap is not None
                    and importance_sampling_factor > importance_sampling_cap
                )
                if exceeds_cap:
                    importance_sampling_factor = importance_sampling_cap
                weight *= importance_sampling_factor

            batch_samples.append(sample)
            batch_labels.append(label)
            batch_weights.append(weight)

            # Keep accumulating until the batch is full or the epoch ends.
            if (i + 1) % batch_size != 0 and i != N_train - 1:
                continue

            # update debate classifier
            print("i", i, flush=True)
            print("batch_weights", batch_weights, flush=True)
            debate_classifier.train(
                np.array(batch_samples),
                np.array(batch_labels),
                np.array(batch_weights),
                shuffle=shuffle_batches,
            )
            acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
            print("Updated debate_classifier", flush=True)
            print("Evaluation accuracy", acc, flush=True)

            now = time.time()
            print("Batch time ", now - batch_start)
            batch_start = now

            batch_samples, batch_labels, batch_weights = [], [], []

    acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
    print("Accuracy", acc, flush=True)