# Example 1
def test_checkpoint_recompute_checkpoint(recompute_checkpoint):
    """
    If a checkpoint is saved with `recompute_checkpoint_every_layer`
    then we should be able to restore the checkpoint in a new run
    that doesn't use `recompute_checkpoint_every_layer` and vice versa.

    `recompute_checkpoint` is the boolean used for the saving model; the
    restoring model is built with the opposite value.
    """
    # NOTE(review): `.split()` assumes parse_bert_args expects an argv-style
    # list of tokens - confirm against its definition.
    args = """
    --config unit_test
    """.split()
    config1 = BertConfig(**(vars(parse_bert_args(args))))
    config1.recompute_checkpoint_every_layer = recompute_checkpoint
    model1 = PipelinedBertForPretraining(config1).parallelize()

    with tempfile.TemporaryDirectory() as checkpoint_dir:
        # Save checkpoint
        config1.checkpoint_output_dir = checkpoint_dir
        save_checkpoint(config1, model1, 0)

        # New model with opposite `recompute_checkpoint` to model1
        config2 = BertConfig(**(vars(parse_bert_args(args))))
        config2.recompute_checkpoint_every_layer = not recompute_checkpoint
        model2 = PipelinedBertForPretraining.from_pretrained(
            os.path.join(checkpoint_dir, "step_0"), config=config2).parallelize()

        # Models should now have the same weights. The restored state dict is
        # built once rather than on every iteration.
        restored_state = model2.state_dict()
        for name, tensor1 in model1.state_dict().items():
            assert torch.allclose(tensor1, restored_state[name]), name
# Example 2
def test_checkpoint_save_restore(recompute_checkpoint, embedding_serialization_factor):
    """
    Test that saving and restoring checkpoints works. Also test checkpointing
    with recomputation checkpoints and embedding serialization.

    Both arguments are written straight onto the config before the models
    are built.
    """
    # NOTE(review): `.split()` assumes parse_bert_args expects an argv-style
    # list of tokens - confirm against its definition.
    args = """
    --config unit_test
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization_factor = embedding_serialization_factor
    model1 = PipelinedBertForPretraining(config).parallelize()
    model2 = PipelinedBertForPretraining(config).parallelize()

    # The two models should have different initial weights.
    # int64 tensors, LayerNorm parameters and biases are skipped
    # (presumably because they are initialised deterministically).
    initial_state2 = model2.state_dict()
    for name, tensor1 in model1.state_dict().items():
        tensor2 = initial_state2[name]
        if (tensor1.dtype is not torch.int64) and ("LayerNorm" not in name) and ("bias" not in name):
            assert not torch.allclose(tensor1, tensor2), name

    # Save and restore checkpoint
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        config.checkpoint_output_dir = checkpoint_dir
        # No checkpoints should exist yet
        assert not checkpoints_exist(config.checkpoint_output_dir)

        save_checkpoint(config, model1, 0)

        # Checkpoint should now exist
        assert checkpoints_exist(config.checkpoint_output_dir)

        # Restore from checkpoint
        model2 = PipelinedBertForPretraining.from_pretrained(
            os.path.join(checkpoint_dir, "step_0"), config=config)

        # Models should now have the same weights
        restored_state = model2.state_dict()
        for name, tensor1 in model1.state_dict().items():
            assert torch.allclose(tensor1, restored_state[name]), name
# Example 3
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model ran on IPU approximately matches that same
    model ran on the CPU.

    Both models start from identical weights and are trained for ten steps
    on the same repeated sample; the losses are compared after every step.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    # NOTE(review): `.split()` assumes parse_bert_args expects an argv-style
    # list of tokens - confirm against its definition.
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --batch-size 10
    --batches-per-step 1
    --gradient-accumulation 10
    --enable-half-partials False
    --optimizer AdamW
    --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    # Dropout is disabled so both models are deterministic and comparable.
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    # NOTE(review): other tests in this file set
    # `embedding_serialization_factor`; confirm which attribute the config
    # actually reads.
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    # Two independently constructed models have different random weights, so
    # copy the CPU weights across before comparing or training.
    model_ipu.load_state_dict(model_cpu.state_dict())

    # Check that copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all() for a, b in zip(
        model_cpu.parameters(), model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(), lr=0.001, loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts, optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer("Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
                       "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute", return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size, [1, config.mask_tokens], dtype=torch.long)
    # `high` is exclusive in torch.randint, so 2 is needed to draw from {0, 1};
    # the previous bound of 1 could only ever produce 0.
    inputs['next_sentence_label'] = torch.randint(0, 2, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length, [1, config.mask_tokens], dtype=torch.long)

    batch_size = config.batch_size
    input_keys = ('input_ids', 'attention_mask', 'token_type_ids',
                  'masked_lm_positions', 'labels', 'next_sentence_label')

    # The IPU receives the sample repeated `batch_size` times in one call;
    # the CPU sees one sample at a time and accumulates gradients instead.
    batch = tuple(inputs[key].repeat(batch_size, 1) for key in input_keys)
    batch_cpu = tuple(inputs[key].repeat(1, 1) for key in input_keys)

    # Training Loop
    for step in range(10):
        # Step CPU model: accumulate the averaged gradient over `batch_size`
        # single-sample passes, then apply one optimizer update so the CPU
        # weights track the IPU weights.
        optimizer_cpu.zero_grad()
        for _ in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]
            cpu_loss.div(batch_size).backward()
        optimizer_cpu.step()

        # Step IPU Model
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")
            # Check the losses are approximately equal
            assert np.allclose(cpu_loss.numpy(), ipu_loss.numpy(), atol=1e-6)