Example #1
def test_multi_value_matmul_prop():
    """Check how a list passed to ``--matmul-proportion`` is parsed.

    A list with as many values as ``--layers-per-ipu`` has entries must be
    stored unchanged on the config. Lists that are one value too short or
    one value too long must make the argument parser exit.
    """
    args = """
    --config unit_test
    --layers-per-ipu 3 7 7 7
    --num-hidden-layers 24
    --matmul-proportion 0.15 0.3 0.3 0.3
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.matmul_proportion == [0.15, 0.3, 0.3, 0.3]

    # Invalid inputs
    # Three proportions for four --layers-per-ipu entries.
    args = """
    --config unit_test
    --layers-per-ipu 3 7 7 7
    --num-hidden-layers 24
    --matmul-proportion 0.15 0.3 0.3
    """.split()
    with pytest.raises(SystemExit):
        BertConfig(**(vars(parse_bert_args(args))))

    # Five proportions for four --layers-per-ipu entries.
    args = """
    --config unit_test
    --layers-per-ipu 3 7 7 7
    --num-hidden-layers 24
    --matmul-proportion 0.15 0.3 0.3 0.3 0.3
    """.split()
    with pytest.raises(SystemExit):
        BertConfig(**(vars(parse_bert_args(args))))
Example #2
def test_invalid_layers_per_ipu():
    """Check that inconsistent ``--layers-per-ipu`` values make the parser exit.

    Every case below pairs ``--num-hidden-layers 3`` with a
    ``--layers-per-ipu`` value that does not describe exactly three layers.
    """
    # Four entries that sum to 4, not 3.
    args = """
    --config unit_test
    --layers-per-ipu 1 1 1 1
    --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        BertConfig(**(vars(parse_bert_args(args))))

    # A single value larger than the number of layers.
    args = """
    --config unit_test
    --layers-per-ipu 4
    --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        BertConfig(**(vars(parse_bert_args(args))))

    # Leading zero entry, remaining entries still sum to 4.
    args = """
    --config unit_test
    --layers-per-ipu 0 1 2 1
    --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        BertConfig(**(vars(parse_bert_args(args))))

    # Five entries that sum to 4.
    args = """
    --config unit_test
    --layers-per-ipu 0 1 1 1 1
    --num-hidden-layers 3
    """.split()
    with pytest.raises(SystemExit):
        BertConfig(**(vars(parse_bert_args(args))))
Example #3
def test_get_layer_ipu():
    """Check the per-layer IPU index list produced by ``_get_layer_ipu``.

    ``_get_layer_ipu`` is given ``config.layers_per_ipu`` and is expected to
    return one IPU index per hidden layer, in layer order.
    """
    # Single value: two layers on each IPU.
    args = """
    --config unit_test
    --layers-per-ipu 2
    --num-hidden-layers 12
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert (_get_layer_ipu(
        config.layers_per_ipu) == [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5])

    # Explicit list whose last IPU holds a single layer.
    args = """
    --config unit_test
    --layers-per-ipu 2 2 2 2 2 1
    --num-hidden-layers 11
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert (_get_layer_ipu(
        config.layers_per_ipu) == [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5])

    # A zero entry leaves IPU 0 without any layer, so indices start at 1.
    args = """
    --config unit_test
    --layers-per-ipu 0 1 1 1
    --num-hidden-layers 3
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert (_get_layer_ipu(config.layers_per_ipu) == [1, 2, 3])
Example #4
def test_multi_value_layers_per_ipu():
    """Check that an explicit ``--layers-per-ipu`` list is stored unchanged.

    Both lists sum to the ``--num-hidden-layers`` value of 10; the second
    one starts with a zero entry.
    """
    args = """
    --config unit_test
    --layers-per-ipu 1 2 3 4
    --num-hidden-layers 10
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.layers_per_ipu == [1, 2, 3, 4]

    args = """
    --config unit_test
    --layers-per-ipu 0 3 3 4
    --num-hidden-layers 10
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.layers_per_ipu == [0, 3, 3, 4]
Example #5
def main(arg_list=None):
    # Entry point for validating a directory of checkpoints.
    #
    # Splits `arg_list` into run-level arguments and the remaining BERT
    # arguments, forwards the checkpoint directory (and, when none was given,
    # a config looked up from that directory) to `utils.parse_bert_args`, and
    # returns the result of `validate_checkpoints`.
    #
    # NOTE(review): this copy of the function is incomplete - the
    # `if not run_args.no_logger_setup:` statement has no body and the final
    # f-string line is the tail of a call whose opening line is missing.
    run_args, remaining_args = parse_args(arg_list)
    remaining_args += ["--checkpoint-dir", run_args.checkpoint_dir]

    # If no config is supplied, try to load the config that should have been saved with the ckpts.
    if "--config" not in remaining_args:
        config_path = find_checkpoint_config(run_args.checkpoint_dir)
        remaining_args += ["--config", config_path]

    bert_args = utils.parse_bert_args(remaining_args)
    if not run_args.no_logger_setup:

    # Force variable weights in inference mode - otherwise we can't override the model weights for
    # validating each new checkpoint.
    bert_args.variable_weights_inference = True
    # Required to allow squeezed models to fit.
    bert_args.max_copy_merge_size = 32000

    logger.info("Program Start")

    # `parse_bert_args` will suffix the user-supplied checkpoint path with the current date/time.
    # To avoid modifying core Bert code, we'll just remove the suffix (we don't need the created
    # config).
    bert_args.checkpoint_dir = os.path.dirname(bert_args.checkpoint_dir)

        f"Validating over checkpoints in directory {bert_args.checkpoint_dir}")
    return validate_checkpoints(run_args, utils.get_validation_args(bert_args))
Example #6
def test_single_value_layers_per_ipu():
    """Check that a single ``--layers-per-ipu`` value is expanded to a list.

    With four hidden layers and one layer per IPU, the config is expected to
    hold one entry per IPU: ``[1, 1, 1, 1]``.
    """
    args = """
    --config unit_test
    --layers-per-ipu 1
    --num-hidden-layers 4
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.layers_per_ipu == [1, 1, 1, 1]
Example #7
def test_single_value_matmul_prop():
    """Check that a single ``--matmul-proportion`` value is expanded to a list.

    With one layer per IPU and four hidden layers, the single value 0.2 is
    expected to be repeated four times on the config.
    """
    # Matmul proportion on all IPUs, not just encoder IPUs
    args = """
    --config unit_test
    --layers-per-ipu 1
    --num-hidden-layers 4
    --matmul-proportion 0.2
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.matmul_proportion == [0.2, 0.2, 0.2, 0.2]
def test_bert_regression(custom_ops,
    Run a pretraining pass of BERT up to the specified number of epochs.

    This test will gather a number of statistics and assert that performance
    hasn't dropped substantially (with 10% leeway in some cases).

    `utils.run_py` only carries out a single step, we need to run multiple
    epochs to check accuracy, so this is based on the training example.
    # NOTE(review): this copy of the test is incomplete - the parameter list
    # after `custom_ops` is missing (the body reads `output_path`,
    # `bert_config_file`, `synthetic`, `synthetic_steps`, `extra_args` and
    # `uid`), the docstring above has lost its delimiters, and the
    # `args_string` list and the `RegressionResult(...)` call are not closed.
    # We'll try to create the output path straight-away so as not to waste
    # time if we get an error
    os.makedirs(output_path, exist_ok=True)

    # Base command line: always skip validation and model saving.
    args_string = [
        "--config", bert_config_file, "--no-validation", "--no-model-save"
    if synthetic:
        args_string += ["--epochs", str(synthetic_steps)]
        args_string += ["--aggregate-metrics-over-steps", str(synthetic_steps)]
    if extra_args is not None:
        args_string += extra_args
    args = parse_bert_args(args_string)
    session, iteration = main(args)

    # Graph report statistics
    # The largest per-tile total and the sum over all tiles are both recorded.
    graph_report = json.loads(session.getGraphReport())
    max_tile_memory = max(graph_report["memory"]["byTile"]["total"])
    total_memory = int(np.sum(graph_report["memory"]["byTile"]["total"]))

    baseline_result = get_test_baseline(uid)

    accuracies = get_accuracy_stats(args, iteration)

    # TODO: Add epochs_to_full back in.
    result = RegressionResult(args.input_files, baseline_result, accuracies,
                              total_memory, max_tile_memory,

    result.write(output_path, uid)

    # Could probably roll these into a single check for success, but for now
    # this will cause the reason of the failure to be line-highlighted
    assert (not result.status & ResultStatus.FAILED_ACCURACY)
    assert (not result.status & ResultStatus.FAILED_MEM_USAGE)
    assert (not result.status & ResultStatus.FAILED_TILE_MEM)
    assert (not result.status & ResultStatus.FAILED_THROUGHPUT)
def test_wikipedia_dataset():
    # Walks every batch of the wikipedia dataset, runs the per-sample
    # `check_*` helpers on it, and finally asserts that the proportions of
    # masked-token replacement types fall inside the expected ranges.
    #
    # NOTE(review): this copy of the test is incomplete - the `DataLoader(`
    # call and the `mask_type_count(` call are both cut off after their
    # first line.
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    config.vocab_size = 30522
    config.input_files = ["data/wikipedia/128/wiki_000.tfrecord"]

    num_tokens = 0
    # Running tally per replacement type; normalised to fractions further down.
    replacement_counts = Counter({"103": 0, "same": 0, "random": 0})

    dataset = get_dataset(config)
    opts = get_options(config)
    loader = DataLoader(opts,

    for datum in tqdm(loader):
        tokens, attn_mask, types, mask_lm_pos, labels, nsp = datum
        # Convert every field to NumPy before indexing individual samples.
        tokens = tokens.numpy()
        attn_mask = attn_mask.numpy()
        types = types.numpy()
        mask_lm_pos = mask_lm_pos.numpy()
        labels = labels.numpy()
        nsp = nsp.numpy()
        for b in range(config.batch_size):
            check_dimensions(config, tokens[b], attn_mask[b], types[b],
                             mask_lm_pos[b], labels[b], nsp[b])
            check_tokens(config, tokens[b], mask_lm_pos[b], labels[b])
            check_attention_mask(attn_mask[b], tokens[b])
            check_mask_lm_positions(config, mask_lm_pos[b])
            check_labels(config, tokens[b], mask_lm_pos[b], labels[b])

            replacement_counts += mask_type_count(tokens[b], mask_lm_pos[b],

            # Number of tokens, not including padding
            num_tokens += attn_mask[b, attn_mask[b] == 1].shape[0]

    # Test masked token proportions
    # `total` is taken before the counts are turned into fractions in place.
    total = sum(replacement_counts.values())
    for k in replacement_counts:
        replacement_counts[k] /= total

    assert (0.79 < replacement_counts["103"] < 0.81)
    assert (0.09 < replacement_counts["same"] < 0.11)
    assert (0.09 < replacement_counts["random"] < 0.11)
    assert (0.14 < total / num_tokens < 0.16)  # should be ~0.15
Example #10
def test_host_embedding():
    """Compare embedding-layer outputs with host embeddings on and off.

    The embedding layer is run once with ``host_embedding = "ALL"`` and once
    with ``host_embedding = "NONE"`` on otherwise identical arguments, and the
    two outputs must agree within a relative tolerance of 0.3.

    Raises:
        TestFailureError: if the two outputs are not close.
    """
    args_string = [
        "--config", 'configs/squad_base_inference.json',
        '--host-embedding=ALL', '--synthetic-data=true'
    ]
    args = utils.parse_bert_args(args_string)
    args.shuffle = False
    args.host_embedding = "ALL"
    host_embedding_outputs = np.array(run_embedding_layer(args), dtype=float)
    args.host_embedding = "NONE"
    ipu_embedding_outputs = np.array(run_embedding_layer(args), dtype=float)

    # Fail only when the outputs differ; the check used to be inverted and
    # reported a mismatch exactly when the outputs agreed.
    if not np.allclose(host_embedding_outputs, ipu_embedding_outputs, rtol=0.3):
        raise TestFailureError("outputs do not match")
Example #11
def test_host_embedding(custom_ops):
    """Compare embedding-layer outputs with host embeddings on and off.

    The embedding layer is run once with ``host_embedding = "ALL"`` and once
    with ``host_embedding = "NONE"`` on otherwise identical arguments, and the
    two outputs must agree within a relative tolerance of 0.3.

    Args:
        custom_ops: not used in the body; presumably a pytest fixture that
            makes the custom ops available - TODO confirm.

    Raises:
        TestFailureError: if the two outputs are not close.
    """
    # NOTE(review): the argument list in this copy was cut off after the
    # trailing comma; further entries may be missing before the closing
    # bracket.
    args_string = [
        '--host-embedding=ALL', '--device-connection-type=ondemand',
    ]
    args = utils.parse_bert_args(args_string)
    args.shuffle = False
    args.host_embedding = "ALL"
    host_embedding_outputs = np.array(run_embedding_layer(args), dtype=float)
    args.host_embedding = "NONE"
    ipu_embedding_outputs = np.array(run_embedding_layer(args), dtype=float)

    # Fail only when the outputs differ; the check used to be inverted and
    # reported a mismatch exactly when the outputs agreed.
    if not np.allclose(host_embedding_outputs, ipu_embedding_outputs, rtol=0.3):
        raise TestFailureError("outputs do not match")
Example #12
def main(arg_list=None):
    # Entry point for fine-tuning over a directory of checkpoints.
    #
    # Splits `arg_list` into run-level arguments and the remaining BERT
    # arguments, forwards the checkpoint directory to
    # `utils.parse_bert_args`, and hands both argument sets to
    # `finetune_checkpoints`. Nothing is returned.
    #
    # NOTE(review): this copy of the function is incomplete - the
    # `if not run_args.no_logger_setup:` statement has no body and the
    # f-string near the end is the middle of a call whose opening and
    # closing lines are missing.
    run_args, remaining_args = parse_args(arg_list)
    remaining_args += ["--checkpoint-dir", run_args.checkpoint_dir]

    bert_args = utils.parse_bert_args(remaining_args)
    if not run_args.no_logger_setup:

    logger.info("Program Start")

    # `parse_bert_args` will suffix the user-supplied checkpoint path with the current date/time.
    # To avoid modifying core Bert code, we'll just remove the suffix (we don't need the created
    # config).
    bert_args.checkpoint_dir = os.path.dirname(bert_args.checkpoint_dir)

        f"Fine-Tuning over checkpoints in directory {bert_args.checkpoint_dir}"
    finetune_checkpoints(run_args, bert_args)
Example #13
def test_constant_lrschedule():
    """
    Test that lr schedule "constant" results in unchanging LR
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
    --config unit_test
    --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    # The scheduler object is not used again below; it is still created
    # because constructing it may adjust the optimizer's initial LR.
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    # NOTE(review): no compile call is present in this copy - confirm
    # whether one was lost.

    # Starting lr should be the configured learning rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # Run for some steps
    for _ in range(5):
        poptorch_model(*mock_data())

    # LR should be unchanged
    # NOTE(review): this reads `_dict_new_optimizer` while the check above
    # reads `_dict_optimizer` - confirm which attribute is intended.
    assert poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate
Example #14
def test_checkpoint_not_in_ir():
    """Check the Popart IR when per-layer recompute checkpoints are disabled.

    With ``--recompute-checkpoint-every-layer False`` no node of the main
    graph may have "Checkpoint" in its name, and the number of nodes whose
    type contains "Stash" must equal the expected count.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --weight-decay 0.0
    --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))

    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    # NOTE(review): `datum` is built but never passed anywhere in this copy;
    # a compile call taking it appears to be missing - confirm.
    datum = get_generated_datum(config)
    ir = json.loads(poptorch_model._debugGetPopartIR())
    # The message now matches the assertion: checkpoints must be absent.
    assert not any(
        "Checkpoint" in node["name"] for node in ir["maingraph"]
    ), ("Popart IR should not contain a checkpoint")

    # Stash: 5 inputs, and 1 stash for transformers on ipu1
    exp_num_stash = 5 + 1
    num_stash = sum("Stash" in node["type"] for node in ir["maingraph"])
    # Printed before asserting so the count is visible on failure as well.
    print(num_stash)
    assert num_stash == exp_num_stash, (
        "Both the graph input and the checkpoint(s) "
        "should be stashed")
Example #15
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    # Define a specific Handler for this file that removes the root name.
    console = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    logger.propagate = False

if __name__ == "__main__":
    # Script entry point: parse the BERT arguments, log some start-up
    # information, then run the main session and, for training runs,
    # validation.
    #
    # NOTE(review): this copy of the script is incomplete - the
    # `if args.inference or not args.no_training:` statement has no body and
    # the validation branch only logs a message.

    args = utils.parse_bert_args()


    logger.info("Program Start")
    logger.info("Hostname: " + socket.gethostname())
    logger.info("Command Executed: " + str(sys.argv))

    # Run the main inference/training session by default
    if args.inference or not args.no_training:

    # If this was a training session and validation isn't disabled; validate.
    # Validation is also skipped when model saving was disabled.
    if not args.inference and not args.no_validation and not args.no_model_save:
        logger.info("Doing Validation")
Example #16
def test_lrschedule_changes_lr():
    """
    Test that pytorch LR scheduler is correctly changing the learning rate
    in poptorch
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Get args and put in config
    args = """
    --config unit_test
    --lr-warmup 0.25
    --lr-schedule linear
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, config.lr_schedule,
                                 config.lr_warmup, config.training_steps)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    # NOTE(review): no compile call is present in this copy - confirm
    # whether one was lost.

    # Starting lr should be 0.0
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == 0.0

    # Run for warmup+1 steps to get to peak
    # NOTE(review): none of the loops below steps `scheduler` or hands the
    # updated optimizer back to `poptorch_model`; those statements appear to
    # be missing from this copy - confirm before relying on the LR checks.
    warmup_steps = int(config.lr_warmup * config.training_steps)
    for _ in range(warmup_steps + 1):
        poptorch_model(*mock_data())

    # After warmup+1 steps LR should be at its peak, config.learning_rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # run the remaining steps
    for _ in range(warmup_steps + 1, config.training_steps):
        poptorch_model(*mock_data())

    # LR should have decreased from the peak
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] < config.learning_rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] > 0.0

    # Running beyond the schedule sets lr=0.0
    for _ in range(config.training_steps, config.training_steps + 1):
        poptorch_model(*mock_data())
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == 0.0
Example #17
from poptorch.enums import DataLoaderMode
from bert_data import get_dataset, get_generated_datum
from bert_model import PipelinedBertWithLoss
from bert_ipu import get_options
from bert_optimization import get_lr_scheduler, get_optimizer
from bert_checkpoint import save_checkpoint, restore_checkpoint, checkpoints_exist
from utils import parse_bert_args, cycle, logger

if __name__ == "__main__":
    # Suppress torch.jit.TracerWarning for the rest of the run.
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Turn the parsed command-line namespace into a BertConfig.
    config = transformers.BertConfig(**vars(parse_bert_args()))

    # Refuse to start when the target checkpoint directory already holds
    # checkpoints: they would otherwise be overwritten.
    if config.checkpoint_dir:
        if checkpoints_exist(config):
            raise RuntimeError(
                "Found previously saved checkpoint(s) at checkpoint-dir. "
                "Overwriting checkpoints is not supported. "
                "Please specify a different checkpoint-dir to "
                "save checkpoints from this run.")

    # Only restore when a checkpoint file was given; otherwise start fresh.
    if config.checkpoint_file:
        checkpoint = restore_checkpoint(config)
    else:
        checkpoint = None

    # Build the execution options from the config.
    opts = get_options(config)

    # W&B
Example #18
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model ran on IPU approximately matches that same
    model ran on the CPU.

    Args:
        recompute_checkpoint: value written to
            ``config.recompute_checkpoint_every_layer``.
        embedding_serialization: value written to
            ``config.embedding_serialization``.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --batch-size 10
    --batches-per-step 1
    --gradient-accumulation 10
    --enable-half-partials False
    --optimizer AdamW
    --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    # Dropout is disabled so both models are deterministic.
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    # NOTE(review): nothing in this copy copies the CPU weights into
    # `model_ipu` before the equality check below - confirm whether that
    # statement was lost.

    # Check that copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all() for a, b in zip(
        model_cpu.parameters(), model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(), lr=0.001, loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts, optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer("Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute", return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size, [1, config.mask_tokens], dtype=torch.long)
    # `high` is exclusive in torch.randint, so 2 is needed for the label to
    # be able to take either of the values 0 and 1.
    inputs['next_sentence_label'] = torch.randint(0, 2, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length, [1, config.mask_tokens], dtype=torch.long)

    batch_size = config.batch_size

    # The IPU model consumes the single sample repeated `batch_size` times.
    batch = (inputs['input_ids'].repeat(batch_size, 1),
             inputs['attention_mask'].repeat(batch_size, 1),
             inputs['token_type_ids'].repeat(batch_size, 1),
             inputs['masked_lm_positions'].repeat(batch_size, 1),
             inputs['labels'].repeat(batch_size, 1),
             inputs['next_sentence_label'].repeat(batch_size, 1))

    # The CPU model is fed the same sample one at a time.
    batch_cpu = (inputs['input_ids'].repeat(1, 1),
                 inputs['attention_mask'].repeat(1, 1),
                 inputs['token_type_ids'].repeat(1, 1),
                 inputs['masked_lm_positions'].repeat(1, 1),
                 inputs['labels'].repeat(1, 1),
                 inputs['next_sentence_label'].repeat(1, 1))

    # Training Loop
    for _ in range(10):
        # Step CPU model
        # NOTE(review): no backward pass or `optimizer_cpu` step is present
        # in this copy - confirm whether those statements were lost.
        for _ in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]

        # Step IPU Model
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")
            # Check the losses are approximately equal
            # detach() first: numpy() refuses tensors that require grad.
            assert np.allclose(cpu_loss.detach().numpy(),
                               ipu_loss.detach().numpy(), atol=1e-6)