def __init__(self, config):
    """Build a BERT pre-training model and annotate it for pipelined IPU execution.

    Reads the following attributes from ``config``: ``pred_head_transform``,
    ``embedding_serialization_factor``, ``hidden_size``, ``vocab_size``,
    ``layers_per_ipu`` and ``recompute_checkpoint_every_layer``.

    Args:
        config: Configuration object forwarded to
            ``transformers.BertForPreTraining`` and ``BertFusedSelfAttention``.
    """
    super().__init__()
    self.config = config
    self.model = transformers.BertForPreTraining(config)

    # Replace the stock self-attention of every encoder layer.
    for encoder_layer in self.model.bert.encoder.layer:
        encoder_layer.attention.self = BertFusedSelfAttention(config)

    if not self.config.pred_head_transform:
        # Disable prediction head transform
        self.model.cls.predictions.transform = nn.Identity()

    if self.config.embedding_serialization_factor > 1:
        serialized_decoder = SerializedLinear(
            self.config.hidden_size,
            self.config.vocab_size,
            self.config.embedding_serialization_factor,
            mode=poptorch.MatMulSerializationMode.OutputChannels)
        self.model.cls.predictions.decoder = serialized_decoder
        # The decoder was swapped out, so re-tie it to the embeddings.
        self.model.tie_weights()

    layer_ipu = _get_layer_ipu(config.layers_per_ipu)

    logger("-------------------- Device Allocation --------------------")
    logger("Embedding --> IPU 0")
    self.model.bert.embeddings = poptorch.BeginBlock(
        self.model.bert.embeddings, "Embedding", ipu_id=0)

    for position, encoder_layer in enumerate(self.model.bert.encoder.layer):
        target_ipu = layer_ipu[position]
        if config.recompute_checkpoint_every_layer:
            encoder_layer = RecomputationCheckpoint(encoder_layer)
        self.model.bert.encoder.layer[position] = poptorch.BeginBlock(
            encoder_layer, f"Encoder{position}", ipu_id=target_ipu)
        logger(f"Encoder {position:<2} --> IPU {target_ipu}")

    logger("Pooler --> IPU 0")
    self.model.bert.pooler = poptorch.BeginBlock(
        self.model.bert.pooler, "Pooler", ipu_id=0)

    logger("Classifier --> IPU 0")
    self.model.cls = poptorch.BeginBlock(
        self.model.cls, "Classifier", ipu_id=0)
    logger("-----------------------------------------------------------")
def test_bert_medium_result():
    """Check that BERT-medium QA inference on the IPU matches native PyTorch.

    The same question/context pair is duplicated into a batch of two, run
    through the native model and through a PopTorch inference model (with the
    position embeddings starting a block on IPU 1), and the start/end scores
    and the decoded answer are compared.
    """
    torch.manual_seed(42)

    # BERT medium, fine-tuned on SQuAD v2.
    pretrained_weights = 'mrm8488/bert-medium-finetuned-squadv2'
    model = transformers.BertForQuestionAnswering.from_pretrained(
        pretrained_weights)
    tokenizer = transformers.BertTokenizer.from_pretrained(
        pretrained_weights, return_token_type_ids=True)

    context = """Edinburgh is Scotland's compact, hilly capital."""
    question = "What is the capital of Scotland?"
    encoding = tokenizer.encode_plus(question, context)

    mask = encoding["attention_mask"]
    ins = encoding["input_ids"]
    input_ids = torch.tensor([ins, ins])
    attention_mask = torch.tensor([mask, mask])

    start_scores_native, end_scores_native = model(
        input_ids, attention_mask=attention_mask)

    opts = poptorch.Options()
    opts.deviceIterations(2)

    model.bert.embeddings.position_embeddings = poptorch.BeginBlock(
        model.bert.embeddings.position_embeddings, ipu_id=1)

    inference_model = poptorch.inferenceModel(model, opts)
    start_score_pop, end_scores_pop = inference_model(input_ids,
                                                      attention_mask)

    # Longer sequences begin to accumulate more floating point error.
    assert torch.allclose(start_scores_native,
                          start_score_pop,
                          rtol=1e-02,
                          atol=1e-02)
    assert torch.allclose(end_scores_native,
                          end_scores_pop,
                          rtol=1e-02,
                          atol=1e-02)

    # Compare the predicted token positions per batch row. The previous form,
    # `assert argmax(a), argmax(b)`, only tested the truthiness of the first
    # value and used the second as the assertion message.
    assert torch.equal(torch.argmax(start_score_pop, dim=-1),
                       torch.argmax(start_scores_native, dim=-1))
    assert torch.equal(torch.argmax(end_scores_pop, dim=-1),
                       torch.argmax(end_scores_native, dim=-1))

    # Convert to string (Only check the first result as we've already
    # established the two were identical)
    ans_tokens = ins[torch.argmax(start_score_pop[0]
                                  ):torch.argmax(end_scores_pop[0]) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)

    assert answer_tokens_to_string == 'edinburgh'
def parallelize(self):
    """Annotate the ViT sub-modules with ``poptorch.BeginBlock`` placements.

    Embeddings go to IPU 0, each encoder layer goes to the IPU given by
    ``_get_layer_ipu(self.config.layers_per_ipu)``, and the final layer norm
    and classifier go to IPU 3. When
    ``self.config.recompute_checkpoint_every_layer`` is set, the value
    returned by ``recomputation_checkpoint`` for each encoder layer is kept
    in ``self._hooks``.

    Returns:
        ``self``, so the call can be chained.
    """
    self._hooks = []
    self.vit.embeddings = poptorch.BeginBlock(
        self.vit.embeddings, "Embedding", ipu_id=0)

    layer_ipu = _get_layer_ipu(self.config.layers_per_ipu)
    for position, encoder_layer in enumerate(self.vit.encoder.layer):
        if self.config.recompute_checkpoint_every_layer:
            # Put checkpoints on every encoder layer
            self._hooks.append(recomputation_checkpoint(encoder_layer))
        target_ipu = layer_ipu[position]
        self.vit.encoder.layer[position] = poptorch.BeginBlock(
            encoder_layer, f"Encoder{position}", ipu_id=target_ipu)

    self.vit.layernorm = poptorch.BeginBlock(
        self.vit.layernorm, "LayerNorm", ipu_id=3)
    self.classifier = poptorch.BeginBlock(
        self.classifier, "Classifier", ipu_id=3)
    return self
def pipeline_model(model, pipeline_splits):
    """
    Split the model into stages.

    Args:
        model: Object exposing ``named_modules()``; its sub-modules named in
            ``pipeline_splits`` are replaced in place.
        pipeline_splits: Sequence of module names using ``/`` as separator.
            The split at position ``i`` starts a block with ``ipu_id=i + 1``.

    Splits that cannot be resolved are reported with a warning and skipped.
    """
    # Log the module names, with a divider in front of each split point.
    for name, _ in model.named_modules():
        name = name.replace('.', '/')
        if name in pipeline_splits:
            logging.info('--------')
        logging.info(name)

    for split_idx, split in enumerate(pipeline_splits):
        split_tokens = split.split('/')
        logging.info(f'Processing pipeline split {split_tokens}')
        parent, node, field_or_idx_str = get_module_and_parent_by_name(
            model, split_tokens)
        if parent is None:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning(f'Split {split} not found')
        else:
            replace_layer(
                parent, field_or_idx_str,
                poptorch.BeginBlock(ipu_id=split_idx + 1, layer_to_call=node))
def __init__(self,
             *,
             dim,
             vae,
             num_text_tokens=10000,
             text_seq_len=256,
             depth,
             heads=8,
             dim_head=64,
             attn_dropout=0.,
             ff_dropout=0,
             sparse_attn=False,
             attn_types=None,
             loss_img_weight=7,
             sandwich_norm=False,
             embedding_ipu_id=0,
             embedding_serialization_factor=1,
             layers_per_ipu=(0, 0, 8, 8),
             cls_ipu_id=None,
             fp16=False):
    """Construct a ``DALLE`` model and annotate it with IPU block placements.

    All arguments up to ``sandwich_norm``, plus ``fp16``, are forwarded
    unchanged to ``DALLE``. The remaining arguments control placement:

    Args:
        embedding_ipu_id: IPU on which the ``image_emb`` block starts.
        embedding_serialization_factor: When greater than 1, ``text_emb`` is
            replaced by a ``SerializedEmbedding`` and ``to_logits[1]`` by a
            ``SerializedLinear`` using this factor.
        layers_per_ipu: Number of transformer layers per IPU; entry ``i``
            is the layer count for IPU ``i``. Must sum to ``depth``.
        cls_ipu_id: If not ``None``, IPU on which the ``to_logits`` block
            starts.

    Raises:
        AssertionError: If ``sum(layers_per_ipu) != depth``.
    """
    super().__init__()
    self.model = DALLE(dim=dim,
                       vae=vae,
                       num_text_tokens=num_text_tokens,
                       text_seq_len=text_seq_len,
                       depth=depth,
                       heads=heads,
                       dim_head=dim_head,
                       attn_dropout=attn_dropout,
                       ff_dropout=ff_dropout,
                       sparse_attn=sparse_attn,
                       attn_types=attn_types,
                       loss_img_weight=loss_img_weight,
                       sandwich_norm=sandwich_norm,
                       fp16=fp16)
    assert sum(layers_per_ipu) == depth, (
        f"layers_per_ipu {list(layers_per_ipu)} must sum to depth {depth}")

    if embedding_serialization_factor > 1:
        self.model.text_emb = SerializedEmbedding(
            self.model.num_text_tokens, dim, embedding_serialization_factor)
        self.model.to_logits[1] = SerializedLinear(
            dim,
            self.model.total_tokens,
            factor=embedding_serialization_factor)

    self.model.vae = poptorch.BeginBlock(self.model.vae, "VAE", ipu_id=0)
    self.model.image_emb = poptorch.BeginBlock(self.model.image_emb,
                                               "image_emb",
                                               ipu_id=embedding_ipu_id)

    # Start a new block at the first transformer layer assigned to each IPU;
    # IPUs with zero layers get no block.
    first_layer = 0
    for ipu_id, layer_count in enumerate(layers_per_ipu):
        if layer_count > 0:
            self.model.transformer.layers[first_layer] = poptorch.BeginBlock(
                self.model.transformer.layers[first_layer],
                "Transformer_" + str(first_layer),
                ipu_id=ipu_id)
        first_layer += layer_count

    if cls_ipu_id is not None:
        self.model.to_logits = poptorch.BeginBlock(self.model.to_logits,
                                                   "cls",
                                                   ipu_id=cls_ipu_id)
# Pre-trained model and tokenizer. tokenizer = transformers.BertTokenizer.from_pretrained( 'mrm8488/bert-medium-finetuned-squadv2', return_token_type_ids=True) model = transformers.BertForQuestionAnswering.from_pretrained( 'mrm8488/bert-medium-finetuned-squadv2') # Parse command-line arguments. context, questions = read_inputs(options) num_questions = len(questions) batch_size = options.batch_size num_batches = num_questions // batch_size # Pipeline the model over two IPUs. You must have at least as many batches (questions) as you have IPUs. model.bert.embeddings.position_embeddings = poptorch.BeginBlock( layer_to_call=model.bert.embeddings.position_embeddings, ipu_id=1) # Wrap PyTorch model inside a PopTorch InferenceModel. This will make the model run on the IPU. opts = poptorch.Options().deviceIterations(batch_size) inference_model = poptorch.inferenceModel(model, options=opts) # Process inputs in batches. for batch_idx in range(num_batches): input_pairs = [ (questions[batch_size*batch_idx + i], context) for i in range(batch_size)] batched_encoding = tokenizer.batch_encode_plus( input_pairs, max_length=110,
the island of Great Britain, mainland Scotland has a 96 mile (154 km) border with England to the southeast and is otherwise surrounded by the Atlantic Ocean to the north and west, the North Sea to the northeast and the Irish Sea to the south. In addition, Scotland includes more than 790 islands; principally within the Northern Isles and the Hebrides archipelagos.""" questions = [ "How many islands are there in Scotland?", "What sea is to the south of Scotland", "Where is England in relation to Scotland?", "How long is the border between England and Scotland?" ] batches = len(questions) # Pipeline the model over two IPUs. You must have at least as many batches (questions) as you have IPUs. model.bert.embeddings.position_embeddings = poptorch.BeginBlock( model.bert.embeddings.position_embeddings, ipu_id=1) # Mark model for inference. opts = poptorch.Options().deviceIterations(batches) inference_model = poptorch.inferenceModel(model, opts) # Batch by the number of iterations so we fill the pipeline. encoding, input_ids, attention_mask = [None] * batches, [[None]] * batches, [ None ] * batches # Encode the query and context. batch_list, atten_list = [], [] # Encode each question for the IPU. for i in range(0, batches):
def warp_start_point(self, layer_pointer, ipu_id=0):
    """Wrap ``layer_pointer`` in a ``poptorch.BeginBlock`` and return the wrapper.

    Args:
        layer_pointer: The layer passed to ``poptorch.BeginBlock``.
        ipu_id: IPU index forwarded as ``ipu_id`` (default 0).

    Returns:
        The value returned by ``poptorch.BeginBlock``.
    """
    return poptorch.BeginBlock(layer_pointer, ipu_id=ipu_id)
# annotations_start import transformers import torch import poptorch # A bert model from hugging face. See the packaged BERT example for actual usage. pretrained_weights = 'mrm8488/bert-medium-finetuned-squadv2' model = transformers.BertForQuestionAnswering.from_pretrained( pretrained_weights) # A handy way of seeing the names of all the layers in the network. print(model) # All layers before "model.bert.encoder.layer[0]" will be on IPU 0 and all layers from # "model.bert.encoder.layer[0]" onwards (inclusive) will be on IPU 1. model.bert.encoder.layer[0] = poptorch.BeginBlock(model.bert.encoder.layer[0], ipu_id=1) # Now all layers before layer are on IPU 1 and this layer onward is on IPU 2 model.bert.encoder.layer[2] = poptorch.BeginBlock(model.bert.encoder.layer[2], ipu_id=2) # Finally all layers from this layer till the end of the network are on IPU 3. model.bert.encoder.layer[4] = poptorch.BeginBlock(model.bert.encoder.layer[4], ipu_id=3) # We must batch the data by at least the number of IPUs. Each IPU will still execute # whatever the model batch size is. data_batch_size = 4 # Create a poptorch.Options instance to override default options opts = poptorch.Options()