"""
Used for testing the MNIST judge. Trains a judge for a (hardcoded) number of steps,
prints its accuracy, and evaluates a single debate on a randomly masked evaluation image.
"""

import numpy as np
from judge import MNISTJudge

if __name__ == "__main__":
    judge = MNISTJudge(4)
    # Take the first evaluation image and flatten it into a 1-D pixel vector.
    img = judge.eval_data[0]
    img_flat = np.reshape(img, img.shape[0] * img.shape[1])
    # Sample 4 of the nonzero pixels and build a binary mask revealing only them.
    nonzero = np.where(img_flat > 0)[0]
    idx = np.random.choice(nonzero, 4)
    mask_flat = np.zeros_like(img_flat)
    mask_flat[idx] = 1
    # Train the judge for a fixed number of steps and report its accuracy.
    N_steps = 100
    judge.train(N_steps)
    print(judge.evaluate_accuracy())
    # Evaluate a single debate: the mask plus the masked image, with labels 0 and 2.
    print(
        judge.evaluate_debate(np.stack((mask_flat, img_flat * mask_flat)),
                              [0, 2]))
Example #2
        help="Number of training steps. If more than 1, apply to corresponding index of n-zero"
    )
    parser.add_argument(
        "--n-zero", nargs="*", type=float, default=[0],
        help="Number of 0-pixels to sample"
    )
    parser.add_argument(
        "--path", type=str, help="Path to save the trained judge to (and restore from)"
    )
    args = parser.parse_args()

    path = args.path or "saved_models/" + args.dataset + str(args.N_to_mask)

    if args.dataset == "mnist":
        judge = MNISTJudge(N_to_mask=args.N_to_mask, model_dir=path)
    elif args.dataset == "fashion":
        judge = FashionJudge(N_to_mask=args.N_to_mask, model_dir=path)
    else:
        raise Exception("Unknown dataset " + args.dataset)

    n_zero = args.n_zero if len(args.n_zero) > 1 else args.n_zero[0]
    t = time.time()
    if args.train_steps == 0:
        print('Received 0 steps. Will not train.')
    else:
        judge.train(args.train_steps, n_zero)
    print('Time', time.time() - t)
    print('Accuracy', judge.evaluate_accuracy(n_zero))
    for i in range(args.N_to_mask + 1):
        print('Accuracy', i, 'black pixels', judge.evaluate_accuracy(i))
Example #3
def run(
    N_to_mask,
    judge_path,
    dataset,
    nmbr_samples,
    start_at_sample,
    eval_unrestricted,
    rollouts,
    index_of_truth_agent,
    changing_sides,
    compute_confusion_matrix,
    precom_eval_seeds,
    image_directory,
    N_images_to_save,
    allow_black_pixels,
):
    """
    Evaluates debate on a given number of samples from a given dataset ("mnist" or "fashion").
    Each debate has N_to_mask rounds.

    Depending on the eval_unrestricted parameter, the debate is played either with precommit or unrestricted.
    The precommitted debate is evaluated in the way described in the "AI safety via debate" paper;
    the unrestricted debate is played once for each sample.

    index_of_truth_agent: Either 0 or 1, depending on whether the honest agent plays first or second.
    changing_sides: If True, the agents switch sides after each move; if False, the first agent reveals N_to_mask/2 features, followed by N_to_mask/2 features revealed by the second agent.
    compute_confusion_matrix: If True, compute a confusion matrix as in figure 3 of the "AI safety via debate" paper. Only available for restricted debate.

    (A hypothetical usage sketch follows the function definition.)

    """
    # Parse parameters
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        raise Exception("Unknown dataset in " + "dataset.txt: " + dataset)

    if not nmbr_samples:
        nmbr_samples = len(judge.eval_data)

    if (precom_eval_seeds % 2) != 1:
        raise Exception(
            "Number of seeds to evaluate the precommited debate must be odd")

    assert image_directory or not N_images_to_save

    print("Parameters")
    print("--------")
    print("N_to_mask:", N_to_mask)
    print("judge_path:", judge_path)
    print("dataset:", dataset)
    print("nmbr_samples:", nmbr_samples)
    print("start_at_sample:", start_at_sample)
    print("eval_unrestricted:", eval_unrestricted)
    print("rollouts:", rollouts)
    print("index_of_truth_agent:", index_of_truth_agent)
    print("changing_sides:", changing_sides)
    print("compute_confusion_matrix:", compute_confusion_matrix)
    print("precom_eval_seeds:", precom_eval_seeds)
    print("image_directory:", image_directory)
    print("N_images_to_save:", N_images_to_save)
    print("allow_black_pixels:", allow_black_pixels)
    print("--------")
    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)
    print("--------", flush=True)

    # Prepare for the confusion matrix
    # confusion_matrix_counter[true_label][lying_label] specifies the number of times 'lying_label' won over 'true_label'
    # labels_frequency counts the occurrences of each label
    if compute_confusion_matrix:
        if eval_unrestricted:
            raise Exception(
                "Consusion matrix can be computed only while evaluating restricted debate"
            )
        confusion_matrix_counter = np.zeros((10, 10), dtype='f')
        labels_frequency = [0] * 10
    else:
        confusion_matrix_counter = None

    # Run debate for each sample
    overall_truth_win_count = 0
    for sample_id in range(start_at_sample, start_at_sample + nmbr_samples):
        sample_start_time = time.time()
        sample = judge.eval_data[sample_id].flatten()
        label = judge.eval_labels[sample_id]

        save_img = image_directory and (
            N_images_to_save is None
            or sample_id - start_at_sample < N_images_to_save)
        # Reproduce the experiment from AI safety via debate paper
        if not eval_unrestricted:
            if save_img:
                dirname = image_directory + '/img' + str(sample_id + 1) + '/'
                makedirs(dirname, exist_ok=True)
            else:
                dirname = None
            truth_won = evaluate_sample_restricted(
                N_to_mask, sample, label, judge, rollouts,
                index_of_truth_agent, changing_sides, allow_black_pixels,
                precom_eval_seeds, confusion_matrix_counter, dirname)
            if compute_confusion_matrix:
                labels_frequency[label] += 1

        # Evaluate unrestricted debate (without precommit)
        else:
            if save_img:
                makedirs(image_directory, exist_ok=True)
                filename = image_directory + '/img' + str(sample_id + 1)
            else:
                filename = None
            truth_won = evaluate_sample_unrestricted(N_to_mask, sample, label,
                                                     judge, rollouts,
                                                     index_of_truth_agent,
                                                     changing_sides,
                                                     allow_black_pixels,
                                                     filename)

        print("\t Sample {}".format(sample_id + 1), end=" ", flush=True)
        if truth_won:
            overall_truth_win_count += 1
            print("Winner: Truth.", end=" ", flush=True)
        else:
            print("Winner: Liar.", end=" ", flush=True)
        print(
            "Truth winrate: {} out of {} ({}%)".format(
                overall_truth_win_count,
                sample_id - start_at_sample + 1,
                100 * overall_truth_win_count /
                (sample_id - start_at_sample + 1),
            ),
            flush=True,
        )
        print("\t  Sample time: {}".format(time.time() - sample_start_time))

    print(
        "Overall truth winrate: {} out of {} ({}%)".format(
            overall_truth_win_count,
            nmbr_samples,
            100 * overall_truth_win_count / nmbr_samples,
        ),
        flush=True,
    )

    if compute_confusion_matrix:
        build_confusion_matrix(confusion_matrix_counter,
                               labels_frequency,
                               dataset,
                               show_matrix=False)
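
# A hypothetical invocation sketch (the argument values below are illustrative
# assumptions, not defaults taken from this repository): evaluate 100 MNIST
# samples with a 6-pixel restricted (precommit) debate, honest agent playing
# second, sides alternating each move, confusion matrix enabled.
#
# run(
#     N_to_mask=6,
#     judge_path=None,
#     dataset="mnist",
#     nmbr_samples=100,
#     start_at_sample=0,
#     eval_unrestricted=False,
#     rollouts=1000,
#     index_of_truth_agent=1,
#     changing_sides=True,
#     compute_confusion_matrix=True,
#     precom_eval_seeds=3,
#     image_directory=None,
#     N_images_to_save=0,
#     allow_black_pixels=False,
# )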
Example #4
"""
Minor script used for debugging.

The purpose is to ensure the judge's 'predictor' and 'estimator' yield the same
prediction accuracy (because they are supposed to contain the same model).
"""

from judge import MNISTJudge, FashionJudge

if __name__ == "__main__":
    dataset = "mnist"
    judge_path = "judge_mnist_4"
    N_to_mask = 4

    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        raise Exception("Unknown dataset in " + "dataset.txt: " + dataset)

    print("accuracy estimator", judge.evaluate_accuracy())
    print("accuracy predictor", judge.evaluate_accuracy_using_predictor())
Example #5
def run(
    N_to_mask,
    judge_path,
    dataset,
    rollouts,
    N_epochs,
    batch_size,
    learning_rate,
    learning_rate_decay,
    classifier_path,
    cheat_debate,
    only_update_for_wins,
    precomputed_debate_results_restricted_first_path,
    precomputed_debate_results_restricted_second_path,
    shuffle_batches,
    use_dropout,
    importance_sampling_weights,
    importance_sampling_cap,
):
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        raise Exception("Unknown dataset in " + "dataset.txt: " + dataset)

    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)

    if precomputed_debate_results_restricted_first_path is not None:
        assert precomputed_debate_results_restricted_second_path is not None
        if cheat_debate:
            raise Exception(
                "cheat_debate should not be enabled when training "
                "from precomputed debate results"
            )
        debate_results_restricted_first = np.fromfile(
            precomputed_debate_results_restricted_first_path
        ).reshape(-1, 10, 10)
        debate_results_restricted_second = np.fromfile(
            precomputed_debate_results_restricted_second_path
        ).reshape(-1, 10, 10)
        print(
            "Loaded debate results from {} and {}".format(
                precomputed_debate_results_restricted_first_path,
                precomputed_debate_results_restricted_second_path,
            )
        )
        print("These will be used for training instead of re-running the debates.")
    else:
        debate_results_restricted_first, debate_results_restricted_second = None, None

    train_data = judge.train_data
    N_train = len(judge.train_labels)
    eval_data = judge.eval_data
    eval_labels = judge.eval_labels

    debate_classifier = DebateClassifier(
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay,
        model_dir=classifier_path,
        use_dropout=use_dropout,
    )

    batch_samples = []
    batch_labels = []
    batch_weights = []

    t = time.time()

    for epoch in range(N_epochs):
        for i in range(N_train):
            # print(i, flush=True)
            sample = train_data[i]
            probs = next(debate_classifier.predict(sample))["probabilities"]
            label = np.random.choice(range(len(probs)), p=probs)
            restricted_first = np.random.random() < 0.5

            if cheat_debate:
                # simulate a perfectly accurate debate
                if label == judge.train_labels[i]:
                    weight = 1
                elif only_update_for_wins:
                    weight = 0
                else:
                    weight = -0.1
            elif debate_results_restricted_first is not None:
                assert debate_results_restricted_second is not None
                if restricted_first:
                    debate_results = debate_results_restricted_first
                else:
                    debate_results = debate_results_restricted_second
                # use precomputed results
                judge_probabilities = debate_results[i, label]
                if np.all(judge_probabilities[label] >= judge_probabilities):
                    weight = 1
                elif only_update_for_wins:
                    weight = 0
                else:
                    weight = -0.1
            else:
                # run non-precommitted debate
                agent_unrestricted = DebateAgent(
                    precommit_label=None, agentStrength=rollouts
                )
                agent_restricted = DebateAgent(
                    precommit_label=label, agentStrength=rollouts
                )
                if restricted_first:
                    agent1, agent2 = agent_restricted, agent_unrestricted
                else:
                    agent1, agent2 = agent_unrestricted, agent_restricted
                debate = Debate((agent1, agent2), judge, N_to_mask, sample.flat)
                utility = debate.play()

                if (utility == 1 and restricted_first) or (
                    utility == -1 and not restricted_first
                ):
                    weight = 1
                elif only_update_for_wins:
                    weight = 0
                else:
                    weight = -0.1

            if importance_sampling_weights:
                importance_sampling_factor = 1 / probs[label]
                if (
                    importance_sampling_cap is not None
                    and importance_sampling_factor > importance_sampling_cap
                ):
                    importance_sampling_factor = importance_sampling_cap
                weight *= importance_sampling_factor

            # print("weight", weight)
            batch_samples.append(sample)
            batch_labels.append(label)
            batch_weights.append(weight)

            if (i + 1) % batch_size == 0 or i == N_train - 1:
                # update debate classifier
                print("i", i, flush=True)
                print("batch_weights", batch_weights, flush=True)
                debate_classifier.train(
                    np.array(batch_samples),
                    np.array(batch_labels),
                    np.array(batch_weights),
                    shuffle=shuffle_batches,
                )
                acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
                print("Updated debate_classifier", flush=True)
                print("Evaluation accuracy", acc, flush=True)
                t2 = time.time()
                print("Batch time ", t2 - t)
                t = t2
                batch_samples = []
                batch_labels = []
                batch_weights = []

    acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
    print("Accuracy", acc, flush=True)