def popart_result_and_model(popart_config, is_bwd=False):
    builder = popart.Builder()
    popart_model = Bert(popart_config, builder=builder)

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)
    proto = builder.getModelProto()

    if is_bwd:
        l1_lambda = 0.1
        l1 = popart.L1Loss(output, "l1LossVal", l1_lambda)
        optimizer = popart.ConstSGD(0.01)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1.output(0)),
                                     loss=l1,
                                     optimizer=optimizer)
    else:
        outputs, post_proto = run_py(proto, data, output)

    return data[input_tensor], outputs, proto, post_proto
def load_model_from_tf(
    file_path,
    is_checkpoint,
    config,
    indices,
    positions,
    segments,
    task,
    builder=popart.Builder(),
):
    """
    Loads weights, etc. from TensorFlow files into the Graphcore IPU BERT
    implementation.

    Can read either checkpoint files or frozen graphs, according to the
    `is_checkpoint` flag passed in as the second argument.

    Requires input tensors to be provided to initialise the graph build.

    The user can optionally pass in a builder object (e.g. for compatibility
    with an older ONNX version). If not provided, a default builder is created.
    """
    initializers = load_initializers_from_tf(file_path, is_checkpoint, config,
                                             task)
    popart_model = Bert(config, builder=builder, initializers=initializers)

    output_tensor = popart_model.build_graph(indices, positions, segments)
    proto = builder.getModelProto()
    return popart_model, proto, output_tensor
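# Usage sketch (not taken from the repository): illustrates how
# load_model_from_tf is intended to be called. The checkpoint path, task name
# and config values below are hypothetical placeholders.
def example_load_model_from_tf_usage():
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=30400,
                        hidden_size=768,
                        sequence_length=128,
                        popart_dtype="FLOAT")
    sequence_info = popart.TensorInfo(
        "UINT32", [config.batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)

    # is_checkpoint=True reads a TF checkpoint; False would read a frozen graph.
    popart_model, proto, output_tensor = load_model_from_tf(
        "path/to/bert_model.ckpt",  # hypothetical checkpoint prefix
        True,
        config,
        indices,
        positions,
        segments,
        task="SQUAD",
        builder=builder)
    return popart_model, proto, output_tensor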
def test_positional_encoding_data(position_length, hidden_size):
    if not tf.executing_eagerly():
        tf.enable_eager_execution()
    assert tf.executing_eagerly()

    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=9728,
                        batch_size=1,
                        hidden_size=hidden_size,
                        max_positional_length=position_length,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        positional_embedding_init_fn="TRANSFORMER",
                        inference=True)
    popart_model = Bert(config, builder=builder)

    shape = (config.max_positional_length, config.hidden_size)
    pos_pa = popart_model.generate_transformer_periodic_pos_data(
        config.dtype, shape)

    pos_tf = get_position_encoding_tf(shape[0], shape[1]).numpy()

    # Tensorflow broadcast multiplication seems to produce slightly different
    # results to numpy, hence the higher than expected error. The embeddings
    # do correlate well between the two despite this.
    assert np.all(np.abs(pos_tf - pos_pa) < 5e-5)
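# Reference sketch (assumption, not part of the test suite): one common
# formulation of the transformer sinusoidal position encoding that the two
# generators compared above are expected to approximate. The exact channel
# ordering (interleaved vs. concatenated sin/cos) may differ from
# get_position_encoding_tf, so treat this only as an illustration of the
# underlying formula.
def example_sinusoidal_position_encoding(num_positions, hidden_size):
    positions = np.arange(num_positions)[:, np.newaxis]              # (P, 1)
    dims = np.arange(hidden_size // 2)[np.newaxis, :]                # (1, H/2)
    angle_rates = 1.0 / np.power(10000.0, 2.0 * dims / hidden_size)  # (1, H/2)
    angles = positions * angle_rates                                 # (P, H/2)
    encoding = np.zeros((num_positions, hidden_size), dtype=np.float32)
    encoding[:, 0::2] = np.sin(angles)
    encoding[:, 1::2] = np.cos(angles)
    return encoding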
def test_simplified_position_encoding(position_length, hidden_size):
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=9728,
                        batch_size=1,
                        hidden_size=hidden_size,
                        max_positional_length=position_length,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        positional_embedding_init_fn="SIMPLIFIED",
                        inference=True)
    popart_model = Bert(config, builder=builder)

    shape = (config.max_positional_length, config.hidden_size)
    pa_data = popart_model.generate_simplified_periodic_pos_data(
        config.dtype, shape)

    bb_data = simplified_generator(position_length, hidden_size)
    assert np.all(np.abs(bb_data - pa_data) < 1e-8)
def popart_result_and_model(popart_config, weight_decay=0, lr=0, l1_lambda=0):
    builder = popart.Builder()
    popart_model = Bert(popart_config, builder=builder)

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)
    proto = builder.getModelProto()

    l1 = popart.L1Loss(output, "l1LossVal", l1_lambda)

    iteration = MockIteration()
    args = MockArgs(lr, weight_decay)
    optimizer_factory = BaseOptimizerFactory(args, iteration,
                                             popart_model.tensors)
    optimizer = optimizer_factory.create()

    outputs, post_proto = run_py(proto,
                                 data, (output, l1.output(0)),
                                 loss=l1,
                                 optimizer=optimizer)

    return data[input_tensor], outputs, proto, post_proto
def create_dataset(args):
    # a simple copy of main bert.py until the dataset creation
    config = BertConfig()
    model = Bert(config, builder=popart.Builder())
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    inputs = [indices, positions, segments, masks, labels]
    embedding_dict, positional_dict = model.get_model_embeddings()
    shapeOf = model.builder.getTensorShape
    inputs = reduce(chain, inputs[3:], inputs[:3])
    tensor_shapes = [(tensorId, shapeOf(tensorId)) for tensorId in inputs]
    dataset = get_bert_dataset(tensor_shapes,
                               input_file=args.input_files,
                               output_dir=args.output_dir,
                               sequence_length=args.sequence_length,
                               vocab_file=args.vocab_file,
                               vocab_length=args.vocab_length,
                               batch_size=args.batch_size,
                               batches_per_step=args.batches_per_step,
                               embedding_dict=embedding_dict,
                               positional_dict=positional_dict,
                               generated_data=args.generated_data,
                               is_training=False,
                               no_drop_remainder=True,
                               shuffle=args.shuffle,
                               mpi_size=args.mpi_size,
                               is_distributed=(args.mpi_size > 1))
    return dataset
def training_run(bert_args, config, initializers, checkpoint_paths):
    logger.info("Building Model")

    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers,
                 execution_mode=bert_args.execution_mode)

    indices, positions, segments, masks, labels = bert_add_inputs(
        bert_args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks,
                               bert_args.execution_mode)

    predictions, probs = bert_infer_graph(model, logits)
    losses = bert_loss_graph(model, probs, labels)
    outputs = bert_add_validation_outputs(model, predictions, losses)

    embedding_dict, positional_dict = model.get_model_embeddings()
    dataset = get_bert_dataset(model, bert_args,
                               [indices, positions, segments, masks, labels],
                               embedding_dict, positional_dict)

    data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

    request_ipus, _ = calc_required_ipus(bert_args, model)
    device = acquire_device(bert_args, request_ipus)

    logger.info(f"Dataset length: {len(dataset)}")
    writer = bert_writer(bert_args)

    iteration = Iteration(
        bert_args,
        batches_per_step=dataset.batches_per_step,
        steps_per_epoch=len(dataset),
        writer=writer,
        recording_steps=bert_args.aggregate_metrics_over_steps)
    optimizer_factory = ScheduledOptimizerFactory(bert_args, iteration, "SGD",
                                                  model.tensors)
    session, anchors = bert_training_session(model, bert_args, data_flow,
                                             losses, device,
                                             optimizer_factory)

    for path in checkpoint_paths:
        ckpt_name = os.path.splitext(os.path.basename(path))[0]
        session.resetHostWeights(os.path.abspath(path))
        session.weightsFromHost()

        logger.info(f"Fine-tuning started for checkpoint: {path}")

        run_fine_tuning_store_ckpt(bert_args, model, ckpt_name, session,
                                   dataset, predictions, losses, labels,
                                   anchors)

    device.detach()
def get_model_proto(config, initializers=None):
    model = Bert(config, pipeline=True, initializers=initializers)

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = model.builder.addInputTensor(sequence_info)
    positions = model.builder.addInputTensor(sequence_info)
    segments = model.builder.addInputTensor(sequence_info)

    output = model.build_graph(indices, positions, segments)

    return onnx.load_model_from_string(model.builder.getModelProto())
def create_dataset(args):
    # a simple copy of main bert.py until the dataset creation
    config = BertConfig()
    model = Bert(config, builder=popart.Builder())
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    inputs = [indices, positions, segments, masks, labels]
    embedding_dict, positional_dict = model.get_model_embeddings()
    shapeOf = model.builder.getTensorShape
    inputs = reduce(chain, inputs[3:], inputs[:3])
    tensor_shapes = [(tensorId, shapeOf(tensorId)) for tensorId in inputs]
    dataset = get_bert_dataset(args, tensor_shapes)
    return dataset
def popart_result_and_model(config, weight_transposed, is_bwd=False):
    """Run popart model based on config.

    Args:
        config (BertConfig): Popart config.
        weight_transposed: Construct embedding dict transposed.
        is_bwd (bool, optional): Construct training graph if True,
            else inference graph. Defaults to False.

    Returns:
        Tuple: Gathered numpy data, outputs from model, proto, post_proto
    """
    user_options = {}
    popart_model = Bert(config)
    builder = popart_model.builder

    indices_len = config.micro_batch_size * config.sequence_length
    sequence_info = popart.TensorInfo("UINT32", [indices_len])
    indices = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (indices_len)).astype(np.uint32)
    }

    # num_splits is assumed to be defined at module scope (e.g. set by a
    # pytest parametrisation); it is not a local of this function.
    output = popart_model.word_embedding_serialized(indices, num_splits)

    if is_bwd:
        l1_loss = popart_model.builder.aiGraphcore.l1loss(
            [output],
            0.1,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        proto = builder.getModelProto()
        optimizer = popart.ConstSGD(0.01)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1_loss),
                                     loss=l1_loss,
                                     optimizer=optimizer,
                                     user_options=user_options)
    else:
        proto = builder.getModelProto()
        outputs, post_proto = run_py(proto,
                                     data,
                                     output,
                                     user_options=user_options)

    return [data[indices]], outputs, proto, post_proto
def test_nsp_fwd(custom_ops): # ------------------- PopART -------------------- config = BertConfig(task="NSP", vocab_length=9728, num_layers=2, micro_batch_size=1, hidden_size=768, sequence_length=128, activation_type="relu", popart_dtype="FLOAT", no_dropout=True, no_attn_dropout=True, inference=True, no_mask=True, mask_tokens=0, split_qkv=False) popart_model = Bert(config) # ------------------- PyTorch ------------------------- torch_model = BertForNextSentencePrediction( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act=config.activation_type, max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, num_labels=2)) fwd_graph(popart_model, torch_model, NSP_MAPPING, transform=NSP_TRANSFORM)
def test_nsp_fwd(custom_ops):
    # ------------------- PopART --------------------
    builder = popart.Builder(
        opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1})
    config = BertConfig(task="NSP",
                        vocab_length=9728,
                        num_layers=2,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        custom_ops=["gather", "attention"],
                        inference=True)
    popart_model = Bert(config, builder=builder)

    # ------------------- PyTorch -------------------------
    torch_model = BertForNextSentencePrediction(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act=config.activation_type,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens,
                        num_labels=2))

    fwd_graph(popart_model,
              torch_model,
              mapping=NSP_MAPPING,
              transform=NSP_TRANSFORM)
def test_pretraining_fwd(custom_ops):
    # ------------------- PopART --------------------
    builder = popart.Builder(
        opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1})
    config = BertConfig(task="PRETRAINING",
                        vocab_length=9728,
                        num_layers=2,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        popart_dtype="FLOAT",
                        activation_type="relu",
                        no_dropout=True,
                        custom_ops=["gather", "attention"],
                        inference=True)
    popart_model = Bert(config, builder=builder)

    # ------------------- PyTorch -------------------------
    torch_model = BertForMaskedLM(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens))

    fwd_graph(popart_model,
              torch_model,
              mapping=onnx_torch_mapping,
              transform=onnx_torch_tform)
def popart_result_and_model(popart_config, is_bwd=False, momentum=0.0):
    popart_model = Bert(popart_config)

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.micro_batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = popart_model.builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)

    if is_bwd:
        l1 = popart_model.builder.aiGraphcore.l1loss(
            [output],
            0.1,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        proto = popart_model.builder.getModelProto()

        # lr and num_reps_bwd are assumed to be constants defined at module
        # scope; they are not locals of this function.
        if momentum > 0.0:
            optimizer = popart.SGD({
                "defaultLearningRate": (lr, False),
                "defaultMomentum": (momentum, False),
                "defaultWeightDecay": (0.0, False)
            })
        else:
            optimizer = popart.ConstSGD(lr)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1),
                                     loss=l1,
                                     optimizer=optimizer,
                                     num_reps=num_reps_bwd)
    else:
        proto = popart_model.builder.getModelProto()
        outputs, post_proto = run_py(proto, data, output)

    return data[input_tensor], outputs, proto, post_proto
def test_pretraining_bwd(custom_ops, opt_type):
    # ------------------- PopART --------------------
    config = BertConfig(task="PRETRAINING",
                        encoder_start_ipu=1,
                        vocab_length=1024,
                        micro_batch_size=1,
                        hidden_size=64,
                        attention_heads=2,
                        sequence_length=20,
                        max_positional_length=20,
                        mask_tokens=2,
                        popart_dtype="FLOAT",
                        activation_type="relu",
                        no_dropout=True,
                        no_attn_dropout=True,
                        update_embedding_dict=True,
                        no_cls_layer=True,
                        no_mask=True,
                        split_qkv=(opt_type == "LAMB"))
    popart_model = Bert(config)

    # ------------------- PyTorch -------------------------
    torch_model = BertForMaskedLM(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        update_embedding_dict=True,
                        mask_tokens=config.mask_tokens))

    l1_lambda = 0.1

    def popart_loss_fn(logits):
        loss = popart_model.builder.aiGraphcore.l1loss(
            [logits[0]],
            l1_lambda,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        popart_model.builder.virtualGraph(loss,
                                          popart_model.mlm_scope.virtualGraph)
        return loss

    bwd_graph(
        popart_model,
        torch_model,
        popart_loss_fn=popart_loss_fn,
        torch_loss_fn=lambda logits: l1_lambda * torch.norm(logits[0], 1),
        mapping={},
        transform=onnx_torch_tform,
        opt_type=opt_type)
def test_nsp_bwd(custom_ops): # ------------------- PopART -------------------- builder = popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }) config = BertConfig(task="NSP", vocab_length=9728, num_layers=1, batch_size=1, hidden_size=768, sequence_length=128, activation_type="relu", popart_dtype="FLOAT", no_dropout=True, custom_ops=["gather", "attention"]) popart_model = Bert(config, builder=builder) # ------------------- PyTorch ------------------------- torch_model = BertForNextSentencePrediction( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act="relu", max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, num_labels=2)) def popart_loss_fn(outputs): loss = popart.L1Loss(outputs[0], "l1Loss", 0.1) loss.virtualGraph(popart_model.nsp_scope.virtualGraph) return [loss] def torch_loss_fn(outputs): return 0.1 * torch.norm(outputs[0], 1) bwd_graph(popart_model, torch_model, popart_loss_fn=popart_loss_fn, torch_loss_fn=torch_loss_fn, mapping={ "bert.pooler.dense.weight": "NSP/PoolW", "bert.pooler.dense.bias": "NSP/PoolB", "cls.seq_relationship.weight": "NSP/NspW", "cls.seq_relationship.bias": "NSP/NspB" }, transform={ "bert.pooler.dense.weight": np.transpose, "cls.seq_relationship.weight": np.transpose })
def popart_result_and_model(popart_config,
                            weight_decay=0.0,
                            lr=0.0,
                            l1_lambda=0.0):
    popart_model = Bert(popart_config)
    builder = popart_model.builder

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.micro_batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)

    l1 = builder.aiGraphcore.l1loss([output],
                                    l1_lambda,
                                    debugContext="l1LossVal",
                                    reduction=popart.ReductionType.Sum)
    proto = builder.getModelProto()

    iteration = MockIteration()
    args = MockArgs("SGD", lr, weight_decay)
    optimizer_factory = BaseOptimizerFactory(args, iteration,
                                             popart_model.tensors)
    optimizer = optimizer_factory.create()

    outputs, post_proto = run_py(proto,
                                 data, (output, l1),
                                 loss=l1,
                                 optimizer=optimizer)

    return data[input_tensor], outputs, proto, post_proto
def test_nsp_bwd(custom_ops, opt_type):
    # ------------------- PopART --------------------
    config = BertConfig(task="NSP",
                        vocab_length=2432,
                        micro_batch_size=1,
                        hidden_size=288,
                        sequence_length=128,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        no_attn_dropout=True,
                        no_mask=True,
                        update_embedding_dict=True,
                        split_qkv=(opt_type == "LAMB"))
    popart_model = Bert(config)

    # ------------------- PyTorch -------------------------
    torch_model = BertForNextSentencePrediction(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act=config.activation_type,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens,
                        update_embedding_dict=True,
                        num_labels=2))

    l1_lambda = 0.1

    def popart_loss_fn(outputs):
        loss = popart_model.builder.aiGraphcore.l1loss(
            [outputs[0]],
            l1_lambda,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        popart_model.builder.virtualGraph(loss,
                                          popart_model.nsp_scope.virtualGraph)
        return loss

    def torch_loss_fn(outputs):
        return l1_lambda * torch.norm(outputs[0], 1)

    bwd_graph(popart_model,
              torch_model,
              popart_loss_fn=popart_loss_fn,
              torch_loss_fn=torch_loss_fn,
              mapping=NSP_MAPPING,
              transform=NSP_TRANSFORM,
              opt_type=opt_type)
def test_pretraining_bwd(custom_ops): # ------------------- PopART -------------------- builder = popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }) config = BertConfig(task="PRETRAINING", vocab_length=9728, projection_serialization_steps=4, num_layers=1, batch_size=1, hidden_size=768, sequence_length=128, popart_dtype="FLOAT", activation_type="relu", no_dropout=True, update_embedding_dict=False) popart_model = Bert(config, builder=builder) # ------------------- PyTorch ------------------------- torch_model = BertForMaskedLM( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act="relu", max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens)) l1_lambda = 0.1 def popart_loss_fn(logits): loss = builder.aiGraphcore.l1loss([logits[0]], l1_lambda, debugPrefix="l1LossVal", reduction=popart.ReductionType.Sum) builder.virtualGraph(loss, popart_model.mlm_scope.virtualGraph) return loss bwd_graph( popart_model, torch_model, popart_loss_fn=popart_loss_fn, torch_loss_fn=lambda logits: l1_lambda * torch.norm(logits[0], 1), mapping=onnx_torch_mapping, transform=onnx_torch_tform)
def test_nsp_bwd(custom_ops): # ------------------- PopART -------------------- builder = popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }) config = BertConfig(task="NSP", vocab_length=9728, num_layers=1, batch_size=1, hidden_size=768, sequence_length=128, activation_type="relu", popart_dtype="FLOAT", no_dropout=True) popart_model = Bert(config, builder=builder) # ------------------- PyTorch ------------------------- torch_model = BertForNextSentencePrediction( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act=config.activation_type, max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, num_labels=2)) def popart_loss_fn(outputs): loss = builder.aiGraphcore.l1loss([outputs[0]], 0.1, debugPrefix="l1Loss", reduction=popart.ReductionType.Sum) builder.virtualGraph(loss, popart_model.nsp_scope.virtualGraph) return loss def torch_loss_fn(outputs): return 0.1 * torch.norm(outputs[0], 1) bwd_graph(popart_model, torch_model, popart_loss_fn=popart_loss_fn, torch_loss_fn=torch_loss_fn, mapping=NSP_MAPPING, transform=NSP_TRANSFORM)
def test_squad_fwd():
    # ------------------- PopART --------------------
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        num_layers=2,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        custom_ops=[],
                        inference=True)
    popart_model = Bert(config, builder=builder)

    # ------------------- PyTorch -------------------------
    torch_model = BertForQuestionAnswering(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens,
                        num_labels=2))

    fwd_graph(popart_model,
              torch_model,
              mapping={
                  "cls.transform.dense.weight": "CLS/LMPredictionW",
                  "cls.transform.dense.bias": "CLS/LMPredictionB",
                  "cls.transform.LayerNorm.weight": "CLS/Gamma",
                  "cls.transform.LayerNorm.bias": "CLS/Beta",
                  "qa_outputs.weight": "Squad/SquadW",
                  "qa_outputs.bias": "Squad/SquadB"
              },
              transform={
                  "cls.transform.dense.weight": np.transpose,
                  "qa_outputs.weight": np.transpose
              })
def test_pretraining_fwd(custom_ops): # ------------------- PopART -------------------- config = BertConfig(task="PRETRAINING", encoder_start_ipu=1, vocab_length=1024, micro_batch_size=1, hidden_size=64, attention_heads=2, sequence_length=20, max_positional_length=20, mask_tokens=2, popart_dtype="FLOAT", activation_type="relu", no_dropout=True, no_attn_dropout=True, no_cls_layer=False, inference=True, no_mask=True, split_qkv=False) popart_model = Bert(config) # ------------------- PyTorch ------------------------- torch_model = BertForMaskedLM( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act="relu", max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, no_cls_layer=config.no_cls_layer)) fwd_graph(popart_model, torch_model, mapping=ONNX_TORCH_MAPPING, transform=onnx_torch_tform)
def test_squad_fwd(custom_ops):
    # ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        encoder_start_ipu=1,
                        vocab_length=1024,
                        micro_batch_size=1,
                        hidden_size=64,
                        attention_heads=2,
                        sequence_length=20,
                        max_positional_length=20,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        no_attn_dropout=True,
                        inference=True,
                        no_mask=True,
                        split_qkv=False,
                        squad_single_output=False)
    popart_model = Bert(config)

    # ------------------- PyTorch -------------------------
    torch_model = BertForQuestionAnswering(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=2,
                        num_labels=2))

    fwd_graph(popart_model,
              torch_model,
              mapping=ONNX_TORCH_MAPPING,
              transform={"qa_outputs.weight": np.transpose})
def pooled_validation_run(bert_args,
                          config,
                          initializers,
                          checkpoint_paths,
                          num_processes=1,
                          available_ipus=16):
    logger.info("Building Model")

    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers)

    indices, positions, segments, masks, labels = bert_add_inputs(
        bert_args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks,
                               bert_args.execution_mode)

    inputs = [indices, positions, segments, *masks]
    outputs = bert_add_logit_outputs(model, logits)

    with tempfile.TemporaryDirectory() as temp_results_path:
        # Inject the checkpoint-specific squad results directory into the
        # dataset args, otherwise they overwrite each other when multithreaded
        bert_args.squad_results_dir = temp_results_path

        dataset = get_bert_dataset(
            model, bert_args, [indices, positions, segments, masks, labels])
        logger.info(f"Dataset length: {len(dataset)}")

        data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

        iteration = Iteration(
            bert_args,
            batches_per_step=dataset.batches_per_step,
            steps_per_epoch=len(dataset),
            writer=None,
            recording_steps=bert_args.aggregate_metrics_over_steps)

        request_ipus, _ = calc_required_ipus(bert_args, model)

        if request_ipus * num_processes > available_ipus:
            raise ValueError(
                "Cannot run with requested number of processes - too many "
                "IPUs required")

        device = acquire_device(bert_args, request_ipus)

        session, anchors = bert_inference_session(model, bert_args, data_flow,
                                                  device)

        model_results = recursive_defaultdict()
        for path in checkpoint_paths:
            session.resetHostWeights(str(path.absolute()))
            session.weightsFromHost()

            logger.info(f"Inference started for checkpoint: {path.absolute()}")
            result = run_inference_extract_result(bert_args, session, dataset,
                                                  inputs, logits, anchors,
                                                  iteration)

            result_into_recursive_path(model_results, path,
                                       bert_args.checkpoint_dir, result)

        device.detach()
    return model_results
def test_embedding_fwd(custom_ops): # ------------------- PopART -------------------- config = BertConfig(task="SQUAD", vocab_length=9728, micro_batch_size=1, hidden_size=768, sequence_length=128, activation_type='relu', popart_dtype="FLOAT", no_dropout=True, inference=True) popart_model = Bert(config) sequence_info = popart.TensorInfo( "UINT32", [config.micro_batch_size * config.sequence_length]) indices = popart_model.builder.addInputTensor(sequence_info) positions = popart_model.builder.addInputTensor(sequence_info) segments = popart_model.builder.addInputTensor(sequence_info) data = { indices: np.random.randint( 0, config.vocab_length, (config.micro_batch_size * config.sequence_length)).astype( np.uint32), positions: np.random.randint( 0, config.max_positional_length, (config.micro_batch_size * config.sequence_length)).astype( np.uint32), segments: np.random.randint( 0, 2, (config.micro_batch_size * config.sequence_length)).astype( np.uint32) } user_options = {"enableStochasticRounding": True} with popart_model.builder.nameScope("Embedding"): output = popart_model.embedding(indices, positions, segments) proto = popart_model.builder.getModelProto() outputs, post_proto = run_py(proto, data, output, user_options=user_options) # ----------------- PopART -> PyTorch ---------------- proto = onnx.load_model_from_string(proto) inputs = [ data[t].reshape(config.micro_batch_size, config.sequence_length).astype(np.int32) for t in [indices, positions, segments] ] # ------------------- PyTorch ------------------------- torch_model = BertEmbeddings( TorchBertConfig(config.vocab_length, config.hidden_size, max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps)) torch_model.eval() copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {}) torch_outputs = run_fwd_model(inputs, torch_model) check_tensors(torch_outputs, outputs, margin=5e-7)
def test_embedding_bwd(custom_ops):
    # ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        micro_batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        update_embedding_dict=True)
    popart_model = Bert(config)
    # Prevent virtualGraph attributes being added to the ops

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = popart_model.builder.addInputTensor(sequence_info)
    positions = popart_model.builder.addInputTensor(sequence_info)
    segments = popart_model.builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.max_positional_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32)
    }

    optimizer = popart.ConstSGD(0.01)
    l1_lambda = 0.1

    with popart_model.builder.nameScope("Embedding"):
        output = popart_model.embedding(indices, positions, segments)

    l1 = popart_model.builder.aiGraphcore.l1loss(
        [output],
        l1_lambda,
        debugContext="l1LossVal",
        reduction=popart.ReductionType.Sum)

    num_reps = 5
    proto = popart_model.builder.getModelProto()
    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 ipus=1,
                                 loss=l1,
                                 num_reps=num_reps,
                                 optimizer=optimizer)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.micro_batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    # ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        update_embedding_dict=config.update_embedding_dict))
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {})

    optim = torch.optim.SGD(torch_model.parameters(), 0.01)
    for _ in range(num_reps):
        torch_output = torch_model(
            *[torch.from_numpy(t).long() for t in inputs])
        torch_loss = l1_lambda * torch.norm(torch_output, 1)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    torch_outputs = [torch_output.detach().numpy()]

    check_tensors(torch_outputs, outputs, margin=7e-6)

    check_model(torch_model, post_proto, TORCH_TO_ONNX, {}, margin=7e-06)
def main(args): set_library_seeds(args.seed) config = bert_config_from_args(args) initializers = bert_pretrained_initialisers(config, args) logger.info("Building Model") # Specifying ai.onnx opset9 for the slice syntax # TODO: Change slice to opset10 model = Bert(config, builder=popart.Builder( opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1}), initializers=initializers) indices, positions, segments, masks, labels = bert_add_inputs(args, model) logits = bert_logits_graph(model, indices, positions, segments, masks) if args.inference: outputs = bert_add_infer_outputs(model, logits) losses = [] writer = None else: predictions, probs = bert_infer_graph(model, logits) losses = bert_loss_graph(model, probs, labels) outputs = bert_add_validation_outputs(model, predictions, losses) writer = bert_writer(args) dataset = get_bert_dataset(model, args, [indices, positions, segments, masks, labels]) logger.info(f"Dataset length: {len(dataset)}") data_flow = popart.DataFlow(dataset.batches_per_step, outputs) iteration = Iteration( args, batches_per_step=dataset.batches_per_step, steps_per_epoch=len(dataset), writer=writer, recording_steps=args.aggregate_metrics_over_steps) request_ipus, required_ipus = calc_required_ipus(args, model) device = acquire_device(args, request_ipus) if args.inference: session, anchors = bert_inference_session(model, args, data_flow, losses, device) logger.info("Inference Started") bert_infer_loop(args, session, dataset, logits, anchors, iteration) device.detach() else: if not args.no_training: optimizer_factory = ScheduledOptimizerFactory(args, iteration, model.pipeline_stage_tensors) session, anchors = bert_training_session(model, args, data_flow, losses, device, optimizer_factory) logger.info("Training Started") bert_train_loop(args, session, writer, dataset, labels, predictions, losses, anchors, iteration, optimizer_factory) device.detach() logger.info("Training Finished") if not args.no_validation: logger.info("Doing Validation") main(utils.get_validation_args(args)) return session, iteration
def main(args): set_library_seeds(args.seed) config = bert_config_from_args(args) initializers = bert_pretrained_initialisers(config, args) logger.info("Building Model") # Specifying ai.onnx opset9 for the slice syntax model = Bert(config, builder=popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }), initializers=initializers, execution_mode=args.execution_mode) # If config.host_embedding is enabled, indices and positions will have the matrices instead of the index vector. indices, positions, segments, masks, labels = bert_add_inputs(args, model) logits = bert_logits_graph(model, indices, positions, segments, masks) if args.inference: predictions = None losses = [] if args.task == "PRETRAINING": # If this is a pretraining session, labels for NSP and MLM are already within the dataset, # so we can always calculate prediction performance predictions, _ = bert_infer_graph(model, logits, include_probs=False) if args.inference_lm_perplexity: losses = bert_perplexity_graph(model, logits, labels) outputs = bert_add_validation_outputs(model, predictions, losses) else: if args.inference_lm_perplexity: raise RuntimeError( "Masked LM perplexity is only supported in pretraining.") outputs = bert_add_logit_outputs(model, logits) writer = None else: predictions, probs = bert_infer_graph(model, logits) losses = bert_loss_graph(model, probs, labels) outputs = bert_add_validation_outputs(model, predictions, losses) writer = bert_writer(args) embedding_dict, positional_dict = model.get_model_embeddings() dataset = get_bert_dataset(model, args, [indices, positions, segments, masks, labels], embedding_dict, positional_dict, config.host_embedding == "MERGE") logger.info(f"Dataset length: {len(dataset)}") data_flow = popart.DataFlow(dataset.batches_per_step, outputs) iteration = Iteration(args, batches_per_step=dataset.batches_per_step, steps_per_epoch=len(dataset), writer=writer, recording_steps=args.aggregate_metrics_over_steps) request_ipus, required_ipus = calc_required_ipus(args, model) device = acquire_device(args, request_ipus) if args.inference: session, anchors = bert_inference_session(model, args, data_flow, device) logger.info("Inference Started") inputs = [indices, positions, segments, *masks, *labels] bert_infer_loop(args, session, dataset, inputs, logits, anchors, labels, predictions, losses, iteration) device.detach() else: if not args.no_training: optimizer_factory = ScheduledOptimizerFactory( args, iteration, model.tensors) session, anchors = bert_training_session(model, args, data_flow, losses, device, optimizer_factory) logger.info("Training Started") bert_train_loop(args, session, writer, dataset, labels, predictions, losses, anchors, iteration, optimizer_factory) device.detach() logger.info("Training Finished") return session, iteration
def run_embedding_layer(args):
    set_library_seeds(args.seed)

    config = bert_config_from_args(args)

    initializers = bert_pretrained_initialisers(config, args)

    logger.info("Building Model")
    # Specifying ai.onnx opset9 for the slice syntax
    # TODO: Change slice to opset10
    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers,
                 execution_mode=args.execution_mode)

    # If config.host_embedding is enabled, indices and positions will have
    # the matrices instead of the index vector.
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    logits = tuple([model.embedding(indices, positions, segments)])

    if args.inference:
        outputs = bert_add_logit_outputs(model, logits)
        writer = None

        dataset = get_bert_dataset(
            model, args, [indices, positions, segments, masks, labels])

        data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

        iteration = Iteration(
            args,
            steps_per_epoch=len(dataset),
            writer=writer,
            recording_steps=args.aggregate_metrics_over_steps)

        request_ipus = bert_required_ipus(args, model)

        device = acquire_device(args, request_ipus)

        session, anchors = bert_inference_session(model, args, data_flow,
                                                  device)
        logger.info("Inference Started")
        inputs = [indices, positions, segments, *masks]
        # bert_infer_loop(args, session, dataset, inputs, logits, anchors,
        #                 iteration)
        save_results = args.task == "SQUAD" and not (args.synthetic_data
                                                     or args.generated_data)

        start_times = defaultdict(list)
        end_times = defaultdict(list)

        # Create the stepio once outside of the inference loop:
        static_data = {}
        if args.low_latency_inference and args.task == "SQUAD":
            stepio = create_callback_stepio(static_data, anchors, start_times,
                                            end_times,
                                            dataset.batches_per_step)
        else:
            stepio = None

        output = []
        logger.info(dataset)
        for data in dataset:
            static_data.update({t: data[t] for t in inputs})
            result = bert_process_infer_data(args, session, static_data,
                                             anchors, logits, iteration,
                                             start_times, end_times, stepio)

            if save_results:
                output.append(result)

            break

        device.detach()
        return output

    return None
def test_embedding_fwd(custom_ops):
    # ------------------- PopART --------------------
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=9728,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        custom_ops=['gather'],
                        inference=True)
    popart_model = Bert(config, builder=builder)
    # Prevent virtualGraph attributes being added to the ops.
    popart_model.embedding_scope = popart_model.device_scope(None, None)
    popart_model.embedding_split_scope = popart_model.embedding_scope

    sequence_info = popart.TensorInfo(
        "UINT32", [config.batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (config.batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.max_positional_length,
                          (config.batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (config.batch_size *
                           config.sequence_length)).astype(np.uint32)
    }

    # Use the custom embedding for layout
    output = popart_model.embedding(indices, positions, segments)

    proto = builder.getModelProto()

    outputs, post_proto = run_py(
        proto, data, output, user_options={"enableStochasticRounding": True})

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    torch_to_onnx = {
        "word_embeddings.weight": "Embedding_Dict",
        "position_embeddings.weight": "Positional_Dict",
        "token_type_embeddings.weight": "Segment_Dict",
        "LayerNorm.weight": "Gamma",
        "LayerNorm.bias": "Beta"
    }

    transposed_weights = {
        "word_embeddings.weight": np.transpose,
        "position_embeddings.weight": np.transpose,
    }

    # ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps))
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx,
                          transposed_weights)

    torch_outputs = run_fwd_model(inputs, torch_model)

    check_tensors(torch_outputs, outputs)