def test_wikipedia_dataset():
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    config.vocab_size = 30522
    config.input_files = ["data/wikipedia/128/wiki_000.tfrecord"]

    num_tokens = 0
    replacement_counts = Counter({"103": 0, "same": 0, "random": 0})

    dataset = get_dataset(config)
    opts = get_options(config)
    loader = DataLoader(opts,
                        dataset,
                        batch_size=config.batch_size,
                        num_workers=config.dataloader_workers)

    for datum in tqdm(loader):
        tokens, attn_mask, types, mask_lm_pos, labels, nsp = datum
        tokens = tokens.numpy()
        attn_mask = attn_mask.numpy()
        types = types.numpy()
        mask_lm_pos = mask_lm_pos.numpy()
        labels = labels.numpy()
        nsp = nsp.numpy()
        for b in range(config.batch_size):
            check_dimensions(config, tokens[b], attn_mask[b], types[b],
                             mask_lm_pos[b], labels[b], nsp[b])
            check_tokens(config, tokens[b], mask_lm_pos[b], labels[b])
            check_attention_mask(attn_mask[b], tokens[b])
            check_mask_lm_positions(config, mask_lm_pos[b])
            check_labels(config, tokens[b], mask_lm_pos[b], labels[b])
            check_token_type(types[b])
            check_nsp(nsp[b])

            replacement_counts += mask_type_count(tokens[b], mask_lm_pos[b],
                                                  labels[b])

            # Number of tokens, not including padding
            num_tokens += attn_mask[b, attn_mask[b] == 1].shape[0]

    # Test masked token proportions
    total = sum(replacement_counts.values())
    for k in replacement_counts:
        replacement_counts[k] /= total

    assert (0.79 < replacement_counts["103"] < 0.81)
    assert (0.09 < replacement_counts["same"] < 0.11)
    assert (0.09 < replacement_counts["random"] < 0.11)
    assert (0.14 < total / num_tokens < 0.16)  # should be ~0.15
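
The replacement-proportion assertions above rely on helpers that are not shown in this snippet. As a rough illustration only, a mask_type_count along these lines would produce the Counter that the test accumulates, assuming 103 is the [MASK] token id and that unused masked-LM slots are zero-padded (both assumptions, not the project's actual implementation):

from collections import Counter

def mask_type_count(tokens, mask_lm_pos, labels):
    # Hypothetical helper: classify each masked position as replaced by
    # [MASK] (id 103), kept as the original token ("same"), or swapped
    # for a random token ("random").
    counts = Counter({"103": 0, "same": 0, "random": 0})
    for pos, label in zip(mask_lm_pos, labels):
        if pos == 0:  # assumed zero-padding of unused masked-LM slots
            continue
        if tokens[pos] == 103:
            counts["103"] += 1
        elif tokens[pos] == label:
            counts["same"] += 1
        else:
            counts["random"] += 1
    return counts
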
def test_constant_lrschedule():
    """
    Test that the "constant" LR schedule results in an unchanging LR
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
    --config unit_test
    --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # The starting LR should equal the configured learning rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # Run for some steps
    for _ in range(5):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should be unchanged
    assert poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate
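
get_lr_scheduler is not shown here; for the "constant" schedule it presumably returns a scheduler whose multiplier is always 1.0, which is what the unchanged-LR assertion relies on. A minimal sketch of that idea using torch.optim.lr_scheduler.LambdaLR (an assumption about the implementation, not the project's code):

import torch

def constant_lr_scheduler(optimizer):
    # A multiplicative factor of 1.0 at every step leaves the optimizer's LR untouched
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1.0)
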
Example #3
def test_checkpoint_not_in_ir():
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --weight-decay 0.0
    --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))

    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    datum = get_generated_datum(config)
    poptorch_model.compile(*datum)
    ir = json.loads(poptorch_model._debugGetPopartIR())
    assert not any("Checkpoint" in node["name"] for node in ir["maingraph"]), \
        "Popart IR should not contain any recompute checkpoints"

    # Stashes: 5 graph inputs, plus 1 stash for the transformer layers on IPU 1
    exp_num_stash = 5 + 1
    num_stash = sum("Stash" in node["type"] for node in ir["maingraph"])
    assert num_stash == exp_num_stash, ("Both the graph inputs and the "
                                        "checkpoint(s) should be stashed")
    print(num_stash)
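
The "Checkpoint" ops this test expects to be absent would be introduced by the model when --recompute-checkpoint-every-layer is True, presumably by wrapping each layer's output with poptorch.recomputationCheckpoint. A sketch of that pattern (class and attribute names are assumptions):

import torch
import poptorch

class CheckpointedEncoder(torch.nn.Module):
    def __init__(self, layers, recompute_checkpoint_every_layer):
        super().__init__()
        self.layers = torch.nn.ModuleList(layers)
        self.recompute_checkpoint_every_layer = recompute_checkpoint_every_layer

    def forward(self, hidden_states):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
            if self.recompute_checkpoint_every_layer:
                # Mark this activation as a recomputation checkpoint in the PopART IR
                hidden_states = poptorch.recomputationCheckpoint(hidden_states)
        return hidden_states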
Example #4
    # Build config from args
    config = transformers.BertConfig(**(vars(parse_bert_args())))

    # Checkpoints should be saved to a directory with no existing checkpoints
    if config.checkpoint_dir and checkpoints_exist(config):
        raise RuntimeError(
            "Found previously saved checkpoint(s) at checkpoint-dir. "
            "Overwriting checkpoints is not supported. "
            "Please specify a different checkpoint-dir to "
            "save checkpoints from this run.")
    # Restore from checkpoint if necessary
    checkpoint = restore_checkpoint(config) if config.checkpoint_file else None

    # Execution parameters
    opts = get_options(config)

    # W&B
    if config.wandb:
        wandb.init(project="torch-bert")
        wandb.config.update(vars(config))

    # Dataset selection
    dataset = get_dataset(config)

    # Dataloader
    logger("------------------- Data Loading Started ------------------")
    start_loading = time.perf_counter()
    loader = DataLoader(opts,
                        dataset,
                        batch_size=config.batch_size,
                        num_workers=config.dataloader_workers)
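
checkpoints_exist and restore_checkpoint are project helpers that are not part of this excerpt. A minimal sketch of what checkpoints_exist might do, assuming checkpoints are written as .pt files directly under config.checkpoint_dir (an assumed layout):

from pathlib import Path

def checkpoints_exist(config):
    # True if the configured checkpoint directory already holds saved checkpoints
    checkpoint_dir = Path(config.checkpoint_dir)
    return checkpoint_dir.is_dir() and any(checkpoint_dir.glob("*.pt"))
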
def test_lrschedule_changes_lr():
    """
    Test that the PyTorch LR scheduler correctly changes the learning rate
    in PopTorch
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Get args and put in config
    args = """
    --config unit_test
    --lr-warmup 0.25
    --lr-schedule linear
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, config.lr_schedule,
                                 config.lr_warmup, config.training_steps)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting lr should be 0.0
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == 0.0

    # Run for warmup+1 steps to get to peak
    warmup_steps = int(config.lr_warmup * config.training_steps)
    for _ in range(warmup_steps + 1):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # After warmup+1 steps the LR should have reached its peak, config.learning_rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # run the remaining steps
    for _ in range(warmup_steps + 1, config.training_steps):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should have decreased from the peak
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] < config.learning_rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] > 0.0

    # Running beyond the schedule sets lr=0.0
    for _ in range(config.training_steps, config.training_steps + 1):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == 0.0
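
For the linear schedule used above, get_lr_scheduler presumably ramps the LR from 0 to config.learning_rate over the warmup fraction and then decays it linearly back to 0, e.g. via transformers' get_linear_schedule_with_warmup. A sketch under that assumption:

from transformers import get_linear_schedule_with_warmup

def linear_lr_scheduler(optimizer, lr_warmup, training_steps):
    # lr_warmup is a fraction of the total steps (0.25 in the test above)
    warmup_steps = int(lr_warmup * training_steps)
    return get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=warmup_steps,
                                           num_training_steps=training_steps)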
Example #6
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model run on the IPU approximately matches the same
    model run on the CPU.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --batch-size 10
    --batches-per-step 1
    --gradient-accumulation 10
    --enable-half-partials False
    --optimizer AdamW
    --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    opts.anchorMode(poptorch.AnchorMode.Final)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    model_ipu.load_state_dict(model_cpu.state_dict())

    # Check that copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all()
               for a, b in zip(model_cpu.parameters(), model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(), lr=0.001, loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts, optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer("Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute", return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size, [1, config.mask_tokens], dtype=torch.long)
    inputs['next_sentence_label'] = torch.randint(0, 1, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length, [1, config.mask_tokens], dtype=torch.long)

    batch_size = config.batch_size

    batch = (inputs['input_ids'].repeat(batch_size, 1),
             inputs['attention_mask'].repeat(batch_size, 1),
             inputs['token_type_ids'].repeat(batch_size, 1),
             inputs['masked_lm_positions'].repeat(batch_size, 1),
             inputs['labels'].repeat(batch_size, 1),
             inputs['next_sentence_label'].repeat(batch_size, 1))

    batch_cpu = (inputs['input_ids'].repeat(1, 1),
                 inputs['attention_mask'].repeat(1, 1),
                 inputs['token_type_ids'].repeat(1, 1),
                 inputs['masked_lm_positions'].repeat(1, 1),
                 inputs['labels'].repeat(1, 1),
                 inputs['next_sentence_label'].repeat(1, 1))

    # Training Loop
    for step in range(10):
        # Step CPU model
        optimizer_cpu.zero_grad()
        for b in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]
            cpu_loss.div(batch_size).backward()
        optimizer_cpu.step()

        # Step IPU Model
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")
            # Check the losses are approximately equal
            assert np.allclose(cpu_loss.detach().numpy(), ipu_loss.numpy(), atol=1e-6)
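
test_ipu_cpu_match takes recompute_checkpoint and embedding_serialization as arguments, so it is presumably driven by pytest parametrization, along these lines (the parameter values are assumptions):

import pytest

@pytest.mark.parametrize("recompute_checkpoint", [False, True])
@pytest.mark.parametrize("embedding_serialization", [1, 5])
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    ...  # body as above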