Python MNISTJudge Examples, judge.MNISTJudge Python Examples

Example #1

0

Show file

def get_debate_results(
    start_point,
    use_test_data,
    batch_size,
    N_samples,
    N_to_mask,
    judge_path,
    restricted_first,
):
    """
    Runs debates on up to batch_size consecutive samples, starting at index
    start_point, and collects the outcome of every debate.

    For each sample ten debates are played. In each of them one agent
    precommits to one of the labels 0-9 (the "restricted" agent) and plays
    against an agent without precommit (the "unrestricted" agent).

    start_point: Index of the first sample to debate on.
    use_test_data: If True, samples are taken from judge.eval_data, otherwise from judge.train_data.
    batch_size: Maximal number of samples to process; fewer are processed if the dataset ends earlier.
    N_samples: Not used by this function.
    N_to_mask: Passed on to the judge and to each debate.
    judge_path: Model directory passed on to the judge.
    restricted_first: If True, the restricted agent plays first, otherwise it plays second.

    Returns a list with one 10x10 array per processed sample. Row `label`
    holds the value returned by debate.play(full_report=True) for the debate
    in which the restricted agent precommitted to `label`.
    """
    # MNISTJudge has to be imported here, because otherwise tensorflow does not
    # work together with multiprocessing
    from judge import MNISTJudge

    judge = MNISTJudge(N_to_mask=N_to_mask,
                       model_dir=judge_path,
                       binary_rewards=False)
    dataset = judge.eval_data if use_test_data else judge.train_data
    n_available = dataset.shape[0]

    result_list = []
    for i in range(batch_size):
        sample_index = start_point + i
        # Valid indices are 0 .. n_available - 1, so the index n_available
        # itself is already past the end of the dataset.
        if sample_index >= n_available:
            break
        print("i", sample_index, flush=True)
        t = time.time()
        # The sample is the same for all ten debates, look it up only once.
        sample = dataset[sample_index]
        results_per_label = np.zeros([10, 10])
        for label in range(10):
            # NOTE: the number of rollouts is read from the module-level `args`.
            unrestricted_agent = DebateAgent(precommit_label=None,
                                             agentStrength=args.rollouts)
            restricted_agent = DebateAgent(precommit_label=label,
                                           agentStrength=args.rollouts)
            # The first element of the tuple handed to Debate is the agent
            # that plays first.
            if restricted_first:
                agent1, agent2 = restricted_agent, unrestricted_agent
            else:
                agent1, agent2 = unrestricted_agent, restricted_agent
            debate = Debate((agent1, agent2), judge, N_to_mask, sample.flat)
            probabilities = debate.play(full_report=True)
            results_per_label[label] = probabilities
        result_list.append(results_per_label)
        print("time", time.time() - t)
    return result_list

Example #2

0

Show file

def run(
    N_to_mask,
    sample_id,
    lying_agent_label,
    judge_path,
    dataset,
    rollouts,
    index_of_true_agent,
    binary_rewards,
    changing_sides,
):
    """
    Runs one debate game on a given sample from a specified dataset (either "mnist" or "fashion") with N_to_mask rounds.
    The debate is modeled by MCTS with the given number of rollouts.
    One player is the honest agent arguing for the correct label, the other either precommits to lying_agent_label or debates without precommit if lying_agent_label is set to None.

    sample_id: Index of the sample in the judge's evaluation data.
    judge_path: Directory of the judge model. If empty, "saved_models/<dataset><N_to_mask>" is used.
    index_of_true_agent: Either 0 or 1 whether the honest agent plays first or second.
    binary_rewards: If set to True, rewards are either 1 or -1, if set to False rewards lie between -1 and 1
    changing_sides: If set to True, agents switch sides after each round, if set to False, the first agent reveals N_to_mask/2 features followed by N_to_mask/2 features of the second agent

    Raises an Exception if neither judge_path nor dataset is given, if the
    dataset is not "mnist" or "fashion", or if lying_agent_label equals the
    true label of the sample.
    """
    # Parse parameters
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("dataset must be specified")

    if dataset == "mnist":
        judge = MNISTJudge(
            N_to_mask=N_to_mask, model_dir=path, binary_rewards=binary_rewards
        )
    elif dataset == "fashion":
        judge = FashionJudge(
            N_to_mask=N_to_mask, model_dir=path, binary_rewards=binary_rewards
        )
    else:
        # format() rather than "+": dataset may be None here (when only
        # judge_path was given) and string concatenation would then fail with
        # a TypeError instead of reporting the real problem.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))

    # Get debate data and run the debate
    sample = judge.eval_data[sample_id].flatten()

    label = judge.eval_labels[sample_id]

    if lying_agent_label == label:
        raise Exception("Lying label cannot be the same as the correct one")

    agent_lie = DebateAgent(precommit_label=lying_agent_label, agentStrength=rollouts)
    agent_truth = DebateAgent(precommit_label=label, agentStrength=rollouts)
    assert index_of_true_agent in [0, 1]

    player_description = DebatePlayers(
        agent_truth, agent_lie, index_of_true_agent, our_name="truth", opp_name="liar"
    )

    debate = Debate(
        player_description.agents,
        judge,
        N_to_mask,
        sample,
        debug=True,
        changing_sides=changing_sides,
    )
    utility = debate.play()
    player_description.print_debate_result(utility, label)

Example #3

0

Show file

File: test_judge.py Project: david-lindner/ai-safety-debate

"""
Used for testing the MNIST judge. Trains a judge for a (hardcoded) number of steps
and prints out its accuracy.
"""

import numpy as np
from judge import MNISTJudge

if __name__ == "__main__":
    # Judge for debates in which 4 pixels are revealed.
    judge = MNISTJudge(4)
    img = judge.eval_data[0]
    img_flat = np.reshape(img, img.shape[0] * img.shape[1])

    # Build a mask that reveals 4 of the non-zero pixels of the image.
    nonzero = np.where(img_flat > 0)[0]
    # Sample without replacement: with replacement the same pixel could be
    # drawn more than once and the mask would reveal fewer than 4 pixels.
    # (Raises a ValueError if the image has fewer than 4 non-zero pixels.)
    idx = np.random.choice(nonzero, 4, replace=False)
    mask_flat = np.zeros_like(img_flat)
    mask_flat[idx] = 1

    N_steps = 100
    judge.train(N_steps)
    print(judge.evaluate_accuracy())
    # The judge input stacks the mask and the masked image.
    print(
        judge.evaluate_debate(np.stack((mask_flat, img_flat * mask_flat)),
                              [0, 2]))

Example #4

0

Show file

        "--train-steps",type=int, 
        help="Number of training steps. If more than 1, apply to corresponding index of n-zero"
    )
    # One or more values; a single value is passed on as a scalar, several
    # values are passed on as a list (see n_zero below).
    # NOTE(review): values are parsed as float although the help text speaks
    # of a number of pixels - confirm this is intended.
    parser.add_argument(
        "--n-zero", nargs="*", type=float, default=[0],
        help="Number of 0-pixels to sample"
    )
    parser.add_argument(
        "--path", type=str, help="Path to save the trained judge to (and restore from)"
    )
    args = parser.parse_args()

    # An explicit --path wins over the default location derived from the
    # dataset name and N_to_mask.
    path = args.path or "saved_models/" + args.dataset + str(args.N_to_mask)

    if args.dataset == "mnist":
        judge = MNISTJudge(N_to_mask=args.N_to_mask, model_dir=path)
    elif args.dataset == "fashion":
        judge = FashionJudge(N_to_mask=args.N_to_mask, model_dir=path)
    else:
        raise Exception("Unknown dataset " + args.dataset)

    # NOTE(review): nargs="*" also accepts "--n-zero" without any value, in
    # which case args.n_zero is an empty list and args.n_zero[0] raises an
    # IndexError.
    n_zero = args.n_zero if len(args.n_zero) > 1 else args.n_zero[0]
    t = time.time()
    # NOTE(review): no default for --train-steps is visible in this part of
    # the file; if it is None when the flag is omitted, training is still
    # attempted - confirm against the argument definition.
    if args.train_steps == 0:
        print('Received 0 steps. Will not train.')
    else:
        judge.train(args.train_steps, n_zero)
    print('Time', time.time() - t)
    # Accuracy for the n_zero setting used above, followed by the accuracy
    # for every value from 0 up to and including N_to_mask.
    print('Accuracy', judge.evaluate_accuracy(n_zero))
    for i in range(args.N_to_mask + 1):
        print('Accuracy', i, 'black pixels', judge.evaluate_accuracy(i))

Example #5

0

Show file

from judge import MNISTJudge

if __name__ == "__main__":
    # Command line: number of revealed features and the file with the
    # precomputed debate results.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--N-to-mask",
        required=True,
        type=int,
        help="Number of features revealed as an input",
    )
    parser.add_argument(
        "--file", required=True, type=str, help="File containing debate results"
    )
    args = parser.parse_args()

    # The judge is only needed for the training labels.
    judge = MNISTJudge(N_to_mask=args.N_to_mask)
    train_labels = judge.train_labels
    truth_wins_count = 0
    restricted_wins_count = 0

    # One 10x10 block per sample; row `label` belongs to the debate in which
    # the restricted agent precommitted to `label`.
    debate_results = np.fromfile(args.file).reshape(-1, 10, 10)
    n_samples = debate_results.shape[0]
    for i in range(n_samples):
        true_label = train_labels[i]
        for label, judge_probabilities in enumerate(debate_results[i]):
            # The restricted agent wins if no other entry exceeds the entry
            # of its own label.
            if not np.all(judge_probabilities[label] >= judge_probabilities):
                continue
            restricted_wins_count += 1
            if label == true_label:
                truth_wins_count += 1

    print("\n-------------------------------")

Example #6

0

Show file

"""
Minor script used for debugging.

The purpose is to ensure the judge's 'predictor' and 'estimator' yield the same
prediction accuracy (because they are supposed to contain the same model).
"""

from judge import MNISTJudge, FashionJudge

if __name__ == "__main__":
    # Hard-coded configuration of the judge to check.
    dataset = "mnist"
    judge_path = "judge_mnist_4"
    N_to_mask = 4

    # An explicit judge_path wins over the default location derived from the
    # dataset name and N_to_mask.
    if not (judge_path or dataset):
        raise Exception("Either judge_path or dataset needs to be specified")
    path = judge_path if judge_path else "saved_models/" + dataset + str(N_to_mask)

    # Pick the judge class that belongs to the dataset.
    judge_classes = {"mnist": MNISTJudge, "fashion": FashionJudge}
    if dataset not in judge_classes:
        raise Exception("Unknown dataset in " + "dataset.txt: " + dataset)
    judge = judge_classes[dataset](N_to_mask=N_to_mask, model_dir=path)

    # Both numbers are expected to agree.
    print("accuracy estimator", judge.evaluate_accuracy())
    print("accuracy predictor", judge.evaluate_accuracy_using_predictor())

Example #7

0

Show file

def run(
    N_to_mask,
    judge_path,
    dataset,
    nmbr_samples,
    start_at_sample,
    eval_unrestricted,
    rollouts,
    index_of_truth_agent,
    changing_sides,
    compute_confusion_matrix,
    precom_eval_seeds,
    image_directory,
    N_images_to_save,
    allow_black_pixels,
):
    """
    Evaluates debate on a given number of samples of a given dataset ("mnist", "fashion").
    Each debate has N_to_mask rounds.

    The debate is either modeled with precommit or unrestricted given the eval_unrestricted parameter.
    The precommitted debate is evaluated in the way described in the "AI safety via debate" paper,
    the unrestricted debate is played once for each sample.

    judge_path: Directory of the judge model. If empty, "saved_models/<dataset><N_to_mask>" is used.
    nmbr_samples: Number of samples to evaluate. If empty or 0, all samples from start_at_sample to the end of the evaluation data are used.
    start_at_sample: Index of the first evaluated sample in the judge's evaluation data.
    index_of_truth_agent: Either 0 or 1 whether the honest agent plays first or second.
    changing_sides: If set to True, agents switch sides after each move, if set to False, the first agent reveals N_to_mask/2 features followed by N_to_mask/2 features of the second agent
    compute_confusion_matrix: If True, compute confusion matrix as in figure 3 in the AISvD paper. Only for restricted debate.
    precom_eval_seeds: Number of seeds used to evaluate the precommitted debate, must be odd.
    image_directory: Directory in which images are saved. Required if N_images_to_save is set.
    N_images_to_save: Images are saved only for the first N_images_to_save samples (for all samples if None).

    Raises an Exception for inconsistent parameters (missing or unknown
    dataset, even precom_eval_seeds, confusion matrix requested for the
    unrestricted debate, start_at_sample beyond the end of the data).
    """
    # Parse parameters
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        # format() rather than "+": dataset may be None here (when only
        # judge_path was given) and string concatenation would then fail with
        # a TypeError instead of reporting the real problem.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))

    if not nmbr_samples:
        # Default: everything from start_at_sample up to the end of the
        # evaluation data. Using the full length here would run past the end
        # of the data whenever start_at_sample is not 0.
        nmbr_samples = len(judge.eval_data) - start_at_sample
        if nmbr_samples <= 0:
            raise Exception(
                "start_at_sample is beyond the end of the evaluation data")

    if (precom_eval_seeds % 2) != 1:
        raise Exception(
            "Number of seeds to evaluate the precommited debate must be odd")

    assert image_directory or not N_images_to_save

    print("Parameters")
    print("--------")
    print("N_to_mask:", N_to_mask)
    print("judge_path:", judge_path)
    print("dataset:", dataset)
    print("nmbr_samples:", nmbr_samples)
    print("start_at_sample:", start_at_sample)
    print("eval_unrestricted:", eval_unrestricted)
    print("rollouts:", rollouts)
    print("index_of_truth_agent:", index_of_truth_agent)
    print("changing_sides:", changing_sides)
    print("compute_confusion_matrix:", compute_confusion_matrix)
    print("precom_eval_seeds:", precom_eval_seeds)
    print("image_directory:", image_directory)
    print("N_images_to_save:", N_images_to_save)
    print("allow_black_pixels:", allow_black_pixels)
    print("--------")
    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)
    print("--------", flush=True)

    # Prepare for confusion
    # confusion_matrix_counter[true_label][lying_label] specifies the number of times 'lying_label' won over 'true_label'
    # labels_frequency counts the occurrences of each label
    if compute_confusion_matrix:
        if eval_unrestricted:
            raise Exception(
                "Consusion matrix can be computed only while evaluating restricted debate"
            )
        confusion_matrix_counter = np.zeros((10, 10), dtype='f')
        labels_frequency = [0] * 10
    else:
        confusion_matrix_counter = None

    # Run debate for each sample
    overall_truth_win_count = 0
    for sample_id in range(start_at_sample, start_at_sample + nmbr_samples):
        sample_start_time = time.time()
        sample = judge.eval_data[sample_id].flatten()
        label = judge.eval_labels[sample_id]
        # Number of samples evaluated so far, including the current one.
        samples_played = sample_id - start_at_sample + 1

        save_img = image_directory and (
            N_images_to_save is None
            or sample_id - start_at_sample < N_images_to_save)
        # Reproduce the experiment from AI safety via debate paper
        if not eval_unrestricted:
            if save_img:
                # One directory per sample.
                dirname = image_directory + '/img' + str(sample_id + 1) + '/'
                makedirs(dirname, exist_ok=True)
            else:
                dirname = None
            truth_won = evaluate_sample_restricted(
                N_to_mask, sample, label, judge, rollouts,
                index_of_truth_agent, changing_sides, allow_black_pixels,
                precom_eval_seeds, confusion_matrix_counter, dirname)
            if compute_confusion_matrix:
                labels_frequency[label] += 1

        # Evaluate unrestricted debate (without precommit)
        else:
            if save_img:
                # One file name prefix per sample inside a shared directory.
                makedirs(image_directory, exist_ok=True)
                filename = image_directory + '/img' + str(sample_id + 1)
            else:
                filename = None
            truth_won = evaluate_sample_unrestricted(N_to_mask, sample, label,
                                                     judge, rollouts,
                                                     index_of_truth_agent,
                                                     changing_sides,
                                                     allow_black_pixels,
                                                     filename)

        print("\t Sample {}".format(sample_id + 1), end=" ", flush=True)
        if truth_won:
            overall_truth_win_count += 1
            print("Winner: Truth.", end=" ", flush=True)
        else:
            print("Winner: Liar.", end=" ", flush=True)
        print(
            "Truth winrate: {} out of {} ({}%)".format(
                overall_truth_win_count,
                samples_played,
                100 * overall_truth_win_count / samples_played,
            ),
            flush=True,
        )
        print("\t  Sample time: {}".format(time.time() - sample_start_time))

    print(
        "Overall truth winrate: {} out of {} ({}%)".format(
            overall_truth_win_count,
            nmbr_samples,
            100 * overall_truth_win_count / nmbr_samples,
        ),
        flush=True,
    )

    if compute_confusion_matrix:
        build_confusion_matrix(confusion_matrix_counter,
                               labels_frequency,
                               dataset,
                               show_matrix=False)

Example #8

0

Show file

File: test_debate_classifier.py Project: david-lindner/ai-safety-debate

"""
Used for testing the debate classifier. Trains it on MNIST with the true labels
and evaluates its accuracy. Only used for testing.
"""

import numpy as np

from judge import MNISTJudge
from agent import DebateClassifier

if __name__ == "__main__":
    # The judge is only used as a source of the training and evaluation data.
    judge = MNISTJudge(4)
    train_data, train_labels = judge.train_data, judge.train_labels
    eval_data, eval_labels = judge.eval_data, judge.eval_labels

    debate_classifier = DebateClassifier()
    n_train = len(train_data)
    # 100 training steps on batches of (at most) 128 samples with the true
    # labels; the start of a batch wraps around at the end of the data.
    for step in range(100):
        first = (step * 128) % n_train
        last = min(first + 128, n_train)
        batch_labels = train_labels[first:last]
        debate_classifier.train(
            train_data[first:last],
            batch_labels,
            # Every sample gets the same weight of 1.
            np.ones_like(batch_labels, dtype=np.float32),
        )

    print("Accuracy", debate_classifier.evaluate_accuracy(eval_data, eval_labels))

Example #9

0

Show file

def _update_weight(label_won, only_update_for_wins):
    """Returns the training weight of a sampled label: 1 for a win, otherwise 0 or -0.1."""
    if label_won:
        return 1
    if only_update_for_wins:
        return 0
    return -0.1


def run(
    N_to_mask,
    judge_path,
    dataset,
    rollouts,
    N_epochs,
    batch_size,
    learning_rate,
    learning_rate_decay,
    classifier_path,
    cheat_debate,
    only_update_for_wins,
    precomputed_debate_results_restricted_first_path,
    precomputed_debate_results_restricted_second_path,
    shuffle_batches,
    use_dropout,
    importance_sampling_weights,
    importance_sampling_cap,
):
    """
    Trains a DebateClassifier on the judge's training data, using the outcome
    of debates instead of the true labels as training signal.

    For every training sample a label is drawn from the current prediction of
    the classifier. The sample is then added to the batch with this label and
    with weight 1 if the label "wins", otherwise with weight 0
    (only_update_for_wins) or -0.1. Whether the label wins is decided by one of:

    cheat_debate: If True, the label wins iff it equals the true label (simulates a perfectly accurate debate).
    precomputed_debate_results_restricted_first_path / ..._second_path: Files with precomputed debate results
        (reshaped to N x 10 x 10). If given, the label wins iff its entry is a maximum of the row [sample, label].
        Both paths have to be given together and cannot be combined with cheat_debate.
    Otherwise a debate between an agent precommitted to the label and an agent without precommit is played
    (with the given number of rollouts), and the label wins iff the precommitted agent wins.

    judge_path: Directory of the judge model. If empty, "saved_models/<dataset><N_to_mask>" is used.
    batch_size: The classifier is updated after every batch_size samples and at the end of each epoch.
    importance_sampling_weights: If True, weights are multiplied by 1 / (predicted probability of the drawn label),
        limited to importance_sampling_cap if that is not None.

    Raises an Exception if neither judge_path nor dataset is given, if the
    dataset is not "mnist" or "fashion", or if cheat_debate is combined with
    precomputed debate results.
    """
    if judge_path:
        path = judge_path
    elif dataset:
        path = "saved_models/" + dataset + str(N_to_mask)
    else:
        raise Exception("Either judge_path or dataset needs to be specified")

    if dataset == "mnist":
        judge = MNISTJudge(N_to_mask=N_to_mask, model_dir=path)
    elif dataset == "fashion":
        judge = FashionJudge(N_to_mask=N_to_mask, model_dir=path)
    else:
        # format() rather than "+": dataset may be None here (when only
        # judge_path was given) and string concatenation would then fail with
        # a TypeError instead of reporting the real problem.
        raise Exception("Unknown dataset in dataset.txt: {}".format(dataset))

    judge_accuracy = judge.evaluate_accuracy()
    print("Judge accuracy:", judge_accuracy)

    if precomputed_debate_results_restricted_first_path is not None:
        assert precomputed_debate_results_restricted_second_path is not None
        if cheat_debate:
            raise Exception(
                "cheat_debate should not be enabled when training "
                "from precomputed debate results"
            )
        debate_results_restricted_first = np.fromfile(
            precomputed_debate_results_restricted_first_path
        ).reshape(-1, 10, 10)
        debate_results_restricted_second = np.fromfile(
            precomputed_debate_results_restricted_second_path
        ).reshape(-1, 10, 10)
        print(
            "Loaded debate results from {} and {}".format(
                precomputed_debate_results_restricted_first_path,
                precomputed_debate_results_restricted_second_path,
            )
        )
        print("These will be used for training instead of re-running the debates.")
    else:
        debate_results_restricted_first, debate_results_restricted_second = None, None

    train_data = judge.train_data
    N_train = len(judge.train_labels)
    eval_data = judge.eval_data
    eval_labels = judge.eval_labels

    debate_classifier = DebateClassifier(
        learning_rate=learning_rate,
        learning_rate_decay=learning_rate_decay,
        model_dir=classifier_path,
        use_dropout=use_dropout,
    )

    batch_samples = []
    batch_labels = []
    batch_weights = []

    t = time.time()

    for epoch in range(N_epochs):
        for i in range(N_train):
            sample = train_data[i]
            # Draw the label to argue for from the classifier's own prediction.
            probs = next(debate_classifier.predict(sample))["probabilities"]
            label = np.random.choice(range(len(probs)), p=probs)
            # The restricted agent plays first or second with equal probability.
            restricted_first = np.random.random() < 0.5

            if cheat_debate:
                # simulate a perfectly accurate debate
                label_won = label == judge.train_labels[i]
            elif debate_results_restricted_first is not None:
                # use precomputed results
                assert debate_results_restricted_second is not None
                if restricted_first:
                    debate_results = debate_results_restricted_first
                else:
                    debate_results = debate_results_restricted_second
                judge_probabilities = debate_results[i, label]
                label_won = np.all(judge_probabilities[label] >= judge_probabilities)
            else:
                # run non-precommited debate
                agent_unrestricted = DebateAgent(
                    precommit_label=None, agentStrength=rollouts
                )
                agent_restricted = DebateAgent(
                    precommit_label=label, agentStrength=rollouts
                )
                if restricted_first:
                    agent1, agent2 = agent_restricted, agent_unrestricted
                else:
                    agent1, agent2 = agent_unrestricted, agent_restricted
                debate = Debate((agent1, agent2), judge, N_to_mask, sample.flat)
                utility = debate.play()
                # Utility 1 is counted as a win of the first agent, utility -1
                # as a win of the second agent.
                label_won = (utility == 1 and restricted_first) or (
                    utility == -1 and not restricted_first
                )

            weight = _update_weight(label_won, only_update_for_wins)

            if importance_sampling_weights:
                importance_sampling_factor = 1 / probs[label]
                if (
                    importance_sampling_cap is not None
                    and importance_sampling_factor > importance_sampling_cap
                ):
                    importance_sampling_factor = importance_sampling_cap
                weight *= importance_sampling_factor

            batch_samples.append(sample)
            batch_labels.append(label)
            batch_weights.append(weight)

            # A batch is complete after batch_size samples; the last batch of
            # an epoch may be smaller.
            if (i + 1) % batch_size == 0 or i == N_train - 1:
                # update debate classifier
                print("i", i, flush=True)
                print("batch_weights", batch_weights, flush=True)
                debate_classifier.train(
                    np.array(batch_samples),
                    np.array(batch_labels),
                    np.array(batch_weights),
                    shuffle=shuffle_batches,
                )
                acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
                print("Updated debate_classifier", flush=True)
                print("Evaluation accuracy", acc, flush=True)
                t2 = time.time()
                print("Batch time ", t2 - t)
                t = t2
                batch_samples = []
                batch_labels = []
                batch_weights = []

    acc = debate_classifier.evaluate_accuracy(eval_data, eval_labels)
    print("Accuracy", acc, flush=True)

Example #10

0

Show file

def test_mnist_judge():
    """Smoke test: the judge stores the N_to_mask value it was constructed with."""
    # dummy test, would make sense to write more at some point
    expected_n_to_mask = 700
    judge = MNISTJudge(N_to_mask=expected_n_to_mask)
    assert judge.N_to_mask == expected_n_to_mask