def test_wikipedia_dataset():
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    config.vocab_size = 30522
    config.input_files = ["data/wikipedia/128/wiki_000.tfrecord"]

    num_tokens = 0
    replacement_counts = Counter({"103": 0, "same": 0, "random": 0})

    dataset = get_dataset(config)
    opts = get_options(config)
    loader = DataLoader(opts,
                        dataset,
                        batch_size=config.batch_size,
                        num_workers=config.dataloader_workers)

    for datum in tqdm(loader):
        tokens, attn_mask, types, mask_lm_pos, labels, nsp = datum
        tokens = tokens.numpy()
        attn_mask = attn_mask.numpy()
        types = types.numpy()
        mask_lm_pos = mask_lm_pos.numpy()
        labels = labels.numpy()
        nsp = nsp.numpy()

        for b in range(config.batch_size):
            check_dimensions(config, tokens[b], attn_mask[b], types[b],
                             mask_lm_pos[b], labels[b], nsp[b])
            check_tokens(config, tokens[b], mask_lm_pos[b], labels[b])
            check_attention_mask(attn_mask[b], tokens[b])
            check_mask_lm_positions(config, mask_lm_pos[b])
            check_labels(config, tokens[b], mask_lm_pos[b], labels[b])
            check_token_type(types[b])
            check_nsp(nsp[b])
            replacement_counts += mask_type_count(tokens[b], mask_lm_pos[b],
                                                  labels[b])
            # Number of tokens, not including padding
            num_tokens += attn_mask[b, attn_mask[b] == 1].shape[0]

    # Test masked token proportions
    total = sum(replacement_counts.values())
    for k in replacement_counts:
        replacement_counts[k] /= total
    assert 0.79 < replacement_counts["103"] < 0.81
    assert 0.09 < replacement_counts["same"] < 0.11
    assert 0.09 < replacement_counts["random"] < 0.11
    assert 0.14 < total / num_tokens < 0.16  # should be ~0.15
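# The helper below is a hypothetical sketch of the `mask_type_count` used in
# test_wikipedia_dataset, assuming it classifies each masked-LM position by
# comparing the corrupted token against its label: "103" for the [MASK] token
# id, "same" when the token was kept unchanged, and "random" otherwise. The
# project's real helper may differ (e.g. in how it handles padded positions).
def mask_type_count_sketch(tokens, mask_lm_pos, labels):
    counts = Counter({"103": 0, "same": 0, "random": 0})
    for pos, label in zip(mask_lm_pos, labels):
        if label == 0:  # assumed padding of unused masked-LM slots
            continue
        token = tokens[pos]
        if token == 103:        # [MASK] token id in the BERT base vocabulary
            counts["103"] += 1
        elif token == label:    # token left unchanged
            counts["same"] += 1
        else:                   # token replaced with a random token
            counts["random"] += 1
    return counts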
def test_constant_lrschedule():
    """
    Test that lr schedule "constant" results in unchanging LR
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
    --config unit_test
    --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting LR should be the configured learning rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # Run for some steps
    for _ in range(5):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should be unchanged
    assert poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate
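# A hypothetical sketch of `get_lr_scheduler`, assuming it wraps the standard
# schedule factories from `transformers`. The signature mirrors the calls in
# the tests (schedule name, optional warmup proportion and training steps);
# the project's real implementation may differ.
from transformers import get_constant_schedule, get_linear_schedule_with_warmup


def get_lr_scheduler_sketch(optimizer, schedule, lr_warmup=None, training_steps=None):
    if schedule == "constant":
        return get_constant_schedule(optimizer)
    if schedule == "linear":
        warmup_steps = int(lr_warmup * training_steps)
        return get_linear_schedule_with_warmup(optimizer, warmup_steps,
                                               training_steps)
    raise ValueError(f"Unknown LR schedule: {schedule}")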
def test_checkpoint_not_in_ir():
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --weight-decay 0.0
    --recompute-checkpoint-every-layer False
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    assert config.recompute_checkpoint_every_layer is False

    # Execution parameters
    opts = get_options(config)
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    # Compile model
    datum = get_generated_datum(config)
    poptorch_model.compile(*datum)
    ir = json.loads(poptorch_model._debugGetPopartIR())

    assert not any("Checkpoint" in node["name"] for node in ir["maingraph"]), (
        "Popart IR should not contain any recompute checkpoints")

    # Stash: 5 inputs, and 1 stash for transformers on ipu1
    exp_num_stash = 5 + 1
    assert sum("Stash" in node["type"] for node in ir["maingraph"]) == exp_num_stash, (
        "Only the graph inputs and the activation passed between the IPUs "
        "should be stashed")
    print(sum("Stash" in node["type"] for node in ir["maingraph"]))
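# A hypothetical sketch of `get_generated_datum`, which the tests use as mock
# input. It is assumed to fabricate one global batch of random tensors with
# the shapes the pipelined model expects (combined batch = batch_size *
# batches_per_step * gradient_accumulation); the real helper may draw the
# data differently.
def get_generated_datum_sketch(config):
    samples = (config.batch_size * config.batches_per_step *
               config.gradient_accumulation)
    input_ids = torch.randint(0, config.vocab_size,
                              [samples, config.sequence_length], dtype=torch.long)
    attention_mask = torch.ones(samples, config.sequence_length, dtype=torch.long)
    token_type_ids = torch.zeros(samples, config.sequence_length, dtype=torch.long)
    masked_lm_positions = torch.randint(0, config.sequence_length,
                                        [samples, config.mask_tokens], dtype=torch.long)
    labels = torch.randint(0, config.vocab_size,
                           [samples, config.mask_tokens], dtype=torch.long)
    next_sentence_label = torch.randint(0, 2, [samples], dtype=torch.long)
    return (input_ids, attention_mask, token_type_ids, masked_lm_positions,
            labels, next_sentence_label)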
# Build config from args
config = transformers.BertConfig(**(vars(parse_bert_args())))

# Checkpoints should be saved to a directory with no existing checkpoints
if config.checkpoint_dir and checkpoints_exist(config):
    raise RuntimeError(
        "Found previously saved checkpoint(s) at checkpoint-dir. "
        "Overwriting checkpoints is not supported. "
        "Please specify a different checkpoint-dir to "
        "save checkpoints from this run.")

# Restore from checkpoint if necessary
checkpoint = restore_checkpoint(config) if config.checkpoint_file else None

# Execution parameters
opts = get_options(config)

# W&B
if config.wandb:
    wandb.init(project="torch-bert")
    wandb.config.update(vars(config))

# Dataset selection
dataset = get_dataset(config)

# Dataloader
logger("------------------- Data Loading Started ------------------")
start_loading = time.perf_counter()
loader = DataLoader(opts,
                    dataset,
                    batch_size=config.batch_size,
                    num_workers=config.dataloader_workers)
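# A hypothetical sketch of the `checkpoints_exist` guard used above, assuming
# it simply reports whether config.checkpoint_dir already holds any files.
# The real check may be stricter (e.g. matching a checkpoint filename pattern).
import glob
import os


def checkpoints_exist_sketch(config):
    ckpt_dir = config.checkpoint_dir
    return os.path.isdir(ckpt_dir) and len(glob.glob(os.path.join(ckpt_dir, "*"))) > 0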
def test_lrschedule_changes_lr():
    """
    Test that pytorch LR scheduler is correctly
    changing the learning rate in poptorch
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Get args and put in config
    args = """
    --config unit_test
    --lr-warmup 0.25
    --lr-schedule linear
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, config.lr_schedule,
                                 config.lr_warmup, config.training_steps)
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting lr should be 0.0
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == 0.0

    # Run for warmup+1 steps to get to the peak
    warmup_steps = int(config.lr_warmup * config.training_steps)
    for _ in range(warmup_steps + 1):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # After warmup+1 steps LR should equal the peak, config.learning_rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == config.learning_rate

    # Run the remaining steps
    for _ in range(warmup_steps + 1, config.training_steps):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should have decayed from the peak but not yet reached zero
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] < config.learning_rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] > 0.0

    # Running beyond the schedule sets lr=0.0
    for _ in range(config.training_steps, config.training_steps + 1):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][
        0] == 0.0
def test_ipu_cpu_match(recompute_checkpoint, embedding_serialization):
    """
    Test that the BERT model ran on IPU
    approximately matches that same model ran on the CPU.
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    # Config
    args = """
    --config unit_test
    --lr-schedule constant
    --layers-per-ipu 0 3
    --vocab-size 30400
    --batch-size 10
    --batches-per-step 1
    --gradient-accumulation 10
    --enable-half-partials False
    --optimizer AdamW
    --learning-rate 0.001
    """.split()
    config = BertConfig(**(vars(parse_bert_args(args))))
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
    config.recompute_checkpoint_every_layer = recompute_checkpoint
    config.embedding_serialization = embedding_serialization

    # Models and options
    opts = get_options(config)
    opts.anchorMode(poptorch.AnchorMode.Final)
    model_cpu = PipelinedBertWithLoss(config).train()
    model_ipu = PipelinedBertWithLoss(config).train()
    model_ipu.load_state_dict(model_cpu.state_dict())

    # Check that the weight copy was successful
    assert model_ipu is not model_cpu
    assert all((a == b).all() for a, b in zip(
        model_cpu.parameters(), model_ipu.parameters()))

    optimizer_cpu = torch.optim.AdamW(model_cpu.parameters(), lr=0.001)
    optimizer_ipu = poptorch.optim.AdamW(model_ipu.parameters(),
                                         lr=0.001,
                                         loss_scaling=1.0)
    poptorch_model = poptorch.trainingModel(model_ipu, opts,
                                            optimizer=optimizer_ipu)

    # Input
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute yo"
        "Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute Hello, my dog is cute",
        return_tensors="pt")
    inputs['labels'] = torch.randint(0, config.vocab_size,
                                     [1, config.mask_tokens],
                                     dtype=torch.long)
    inputs['next_sentence_label'] = torch.randint(0, 1, [1], dtype=torch.long)
    inputs['masked_lm_positions'] = torch.randint(0, config.sequence_length,
                                                  [1, config.mask_tokens],
                                                  dtype=torch.long)

    batch_size = config.batch_size
    batch = (inputs['input_ids'].repeat(batch_size, 1),
             inputs['attention_mask'].repeat(batch_size, 1),
             inputs['token_type_ids'].repeat(batch_size, 1),
             inputs['masked_lm_positions'].repeat(batch_size, 1),
             inputs['labels'].repeat(batch_size, 1),
             inputs['next_sentence_label'].repeat(batch_size, 1))

    batch_cpu = (inputs['input_ids'].repeat(1, 1),
                 inputs['attention_mask'].repeat(1, 1),
                 inputs['token_type_ids'].repeat(1, 1),
                 inputs['masked_lm_positions'].repeat(1, 1),
                 inputs['labels'].repeat(1, 1),
                 inputs['next_sentence_label'].repeat(1, 1))

    # Training Loop
    for step in range(10):
        # Step the CPU model, accumulating gradients over batch_size
        # micro-batches to match the IPU's gradient accumulation
        optimizer_cpu.zero_grad()
        for b in range(batch_size):
            cpu_output = model_cpu(*batch_cpu)
            cpu_loss = cpu_output[0]
            cpu_loss.div(batch_size).backward()
        optimizer_cpu.step()

        # Step IPU Model
        ipu_output = poptorch_model(*batch)
        ipu_loss = ipu_output[0]

        with torch.no_grad():
            print(f"CPU Loss: {cpu_loss}, IPU Loss: {ipu_loss}")
            # Check the losses are approximately equal
            assert np.allclose(cpu_loss.numpy(), ipu_loss.numpy(), atol=1e-6)