def popart_result_and_model(popart_config, is_bwd=False):
    builder = popart.Builder()
    popart_model = Bert(popart_config, builder=builder)

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)
    proto = builder.getModelProto()

    if is_bwd:
        l1_lambda = 0.1
        l1 = popart.L1Loss(output, "l1LossVal", l1_lambda)
        optimizer = popart.ConstSGD(0.01)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1.output(0)),
                                     loss=l1,
                                     optimizer=optimizer)
    else:
        outputs, post_proto = run_py(proto, data, output)

    return data[input_tensor], outputs, proto, post_proto
def load_model_from_tf(
    file_path,
    is_checkpoint,
    config,
    indices,
    positions,
    segments,
    task,
    builder=popart.Builder(),
):
    """
    Loads weights, etc. from TensorFlow files into the Graphcore IPU BERT
    implementation.

    Can read either checkpoint files or frozen graphs, according to the
    `is_checkpoint` flag passed in as the second argument.

    Requires input tensors to be provided to initialise the graph build.

    The user can optionally pass in a builder object (e.g. for compatibility
    with an older ONNX version). If not provided, a default builder is created.
    """
    initializers = load_initializers_from_tf(file_path, is_checkpoint, config,
                                             task)
    popart_model = Bert(config, builder=builder, initializers=initializers)

    output_tensor = popart_model.build_graph(indices, positions, segments)
    proto = builder.getModelProto()
    return popart_model, proto, output_tensor
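# Usage sketch (not taken from the repository): illustrates how
# load_model_from_tf is intended to be called. The checkpoint path, task name
# and config values below are hypothetical placeholders.
def example_load_model_from_tf_usage():
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=30400,
                        hidden_size=768,
                        sequence_length=128,
                        popart_dtype="FLOAT")
    sequence_info = popart.TensorInfo(
        "UINT32", [config.batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)

    # is_checkpoint=True reads a TF checkpoint; False would read a frozen graph.
    popart_model, proto, output_tensor = load_model_from_tf(
        "path/to/bert_model.ckpt",  # hypothetical checkpoint prefix
        True,
        config,
        indices,
        positions,
        segments,
        task="SQUAD",
        builder=builder)
    return popart_model, proto, output_tensor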
def test_positional_encoding_data(position_length, hidden_size):
    if not tf.executing_eagerly():
        tf.enable_eager_execution()
    assert tf.executing_eagerly()

    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=9728,
                        batch_size=1,
                        hidden_size=hidden_size,
                        max_positional_length=position_length,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        positional_embedding_init_fn="TRANSFORMER",
                        inference=True)
    popart_model = Bert(config, builder=builder)

    shape = (config.max_positional_length, config.hidden_size)
    pos_pa = popart_model.generate_transformer_periodic_pos_data(
        config.dtype, shape)

    pos_tf = get_position_encoding_tf(shape[0], shape[1]).numpy()

    # Tensorflow broadcast multiplication seems to produce slightly different
    # results to numpy, hence the higher than expected error. The embeddings
    # do correlate well between the two despite this.
    assert np.all(np.abs(pos_tf - pos_pa) < 5e-5)
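# Reference sketch (assumption, not part of the test suite): one common
# formulation of the transformer sinusoidal position encoding that the two
# generators compared above are expected to approximate. The exact channel
# ordering (interleaved vs. concatenated sin/cos) may differ from
# get_position_encoding_tf, so treat this only as an illustration of the
# underlying formula.
def example_sinusoidal_position_encoding(num_positions, hidden_size):
    positions = np.arange(num_positions)[:, np.newaxis]              # (P, 1)
    dims = np.arange(hidden_size // 2)[np.newaxis, :]                # (1, H/2)
    angle_rates = 1.0 / np.power(10000.0, 2.0 * dims / hidden_size)  # (1, H/2)
    angles = positions * angle_rates                                 # (P, H/2)
    encoding = np.zeros((num_positions, hidden_size), dtype=np.float32)
    encoding[:, 0::2] = np.sin(angles)
    encoding[:, 1::2] = np.cos(angles)
    return encoding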
def test_simplified_position_encoding(position_length, hidden_size):
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=9728,
                        batch_size=1,
                        hidden_size=hidden_size,
                        max_positional_length=position_length,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        positional_embedding_init_fn="SIMPLIFIED",
                        inference=True)
    popart_model = Bert(config, builder=builder)

    shape = (config.max_positional_length, config.hidden_size)
    pa_data = popart_model.generate_simplified_periodic_pos_data(
        config.dtype, shape)

    bb_data = simplified_generator(position_length, hidden_size)
    assert np.all(np.abs(bb_data - pa_data) < 1e-8)
def popart_result_and_model(popart_config, weight_decay=0, lr=0, l1_lambda=0):
    builder = popart.Builder()
    popart_model = Bert(popart_config, builder=builder)

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)
    proto = builder.getModelProto()

    l1 = popart.L1Loss(output, "l1LossVal", l1_lambda)

    iteration = MockIteration()
    args = MockArgs(lr, weight_decay)
    optimizer_factory = BaseOptimizerFactory(args, iteration,
                                             popart_model.tensors)
    optimizer = optimizer_factory.create()

    outputs, post_proto = run_py(proto,
                                 data, (output, l1.output(0)),
                                 loss=l1,
                                 optimizer=optimizer)

    return data[input_tensor], outputs, proto, post_proto
def create_dataset(args):
    # a simple copy of main bert.py until the dataset creation
    config = BertConfig()
    model = Bert(config, builder=popart.Builder())
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    inputs = [indices, positions, segments, masks, labels]
    embedding_dict, positional_dict = model.get_model_embeddings()
    shapeOf = model.builder.getTensorShape
    inputs = reduce(chain, inputs[3:], inputs[:3])
    tensor_shapes = [(tensorId, shapeOf(tensorId)) for tensorId in inputs]
    dataset = get_bert_dataset(tensor_shapes,
                               input_file=args.input_files,
                               output_dir=args.output_dir,
                               sequence_length=args.sequence_length,
                               vocab_file=args.vocab_file,
                               vocab_length=args.vocab_length,
                               batch_size=args.batch_size,
                               batches_per_step=args.batches_per_step,
                               embedding_dict=embedding_dict,
                               positional_dict=positional_dict,
                               generated_data=args.generated_data,
                               is_training=False,
                               no_drop_remainder=True,
                               shuffle=args.shuffle,
                               mpi_size=args.mpi_size,
                               is_distributed=(args.mpi_size > 1))
    return dataset
def training_run(bert_args, config, initializers, checkpoint_paths):
    logger.info("Building Model")

    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers,
                 execution_mode=bert_args.execution_mode)

    indices, positions, segments, masks, labels = bert_add_inputs(
        bert_args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks,
                               bert_args.execution_mode)

    predictions, probs = bert_infer_graph(model, logits)
    losses = bert_loss_graph(model, probs, labels)
    outputs = bert_add_validation_outputs(model, predictions, losses)

    embedding_dict, positional_dict = model.get_model_embeddings()
    dataset = get_bert_dataset(model, bert_args,
                               [indices, positions, segments, masks, labels],
                               embedding_dict, positional_dict)

    data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

    request_ipus, _ = calc_required_ipus(bert_args, model)
    device = acquire_device(bert_args, request_ipus)

    logger.info(f"Dataset length: {len(dataset)}")
    writer = bert_writer(bert_args)

    iteration = Iteration(
        bert_args,
        batches_per_step=dataset.batches_per_step,
        steps_per_epoch=len(dataset),
        writer=writer,
        recording_steps=bert_args.aggregate_metrics_over_steps)
    optimizer_factory = ScheduledOptimizerFactory(bert_args, iteration, "SGD",
                                                  model.tensors)
    session, anchors = bert_training_session(model, bert_args, data_flow,
                                             losses, device,
                                             optimizer_factory)

    for path in checkpoint_paths:
        ckpt_name = os.path.splitext(os.path.basename(path))[0]
        session.resetHostWeights(os.path.abspath(path))
        session.weightsFromHost()

        logger.info(f"Fine-tuning started for checkpoint: {path}")

        run_fine_tuning_store_ckpt(bert_args, model, ckpt_name, session,
                                   dataset, predictions, losses, labels,
                                   anchors)

    device.detach()
def get_model_proto(config, initializers=None):
    model = Bert(config, pipeline=True, initializers=initializers)

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = model.builder.addInputTensor(sequence_info)
    positions = model.builder.addInputTensor(sequence_info)
    segments = model.builder.addInputTensor(sequence_info)

    output = model.build_graph(indices, positions, segments)

    return onnx.load_model_from_string(model.builder.getModelProto())
def create_dataset(args):
    # a simple copy of main bert.py until the dataset creation
    config = BertConfig()
    model = Bert(config, builder=popart.Builder())
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    inputs = [indices, positions, segments, masks, labels]
    embedding_dict, positional_dict = model.get_model_embeddings()
    shapeOf = model.builder.getTensorShape
    inputs = reduce(chain, inputs[3:], inputs[:3])
    tensor_shapes = [(tensorId, shapeOf(tensorId)) for tensorId in inputs]
    dataset = get_bert_dataset(args, tensor_shapes)
    return dataset
def popart_result_and_model(config, weight_transposed, is_bwd=False):
    """Run popart model based on config.

    Args:
        config (BertConfig): Popart config.
        weight_transposed: Construct embedding dict transposed.
        is_bwd (bool, optional): Construct training graph if True,
            else inference graph. Defaults to False.

    Returns:
        Tuple: Gathered numpy data, outputs from model, proto, post_proto
    """
    user_options = {}
    popart_model = Bert(config)
    builder = popart_model.builder

    indices_len = config.micro_batch_size * config.sequence_length
    sequence_info = popart.TensorInfo("UINT32", [indices_len])
    indices = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (indices_len)).astype(np.uint32)
    }

    # num_splits is assumed to be defined at module scope (e.g. set by a
    # pytest parametrisation); it is not a local of this function.
    output = popart_model.word_embedding_serialized(indices, num_splits)

    if is_bwd:
        l1_loss = popart_model.builder.aiGraphcore.l1loss(
            [output],
            0.1,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        proto = builder.getModelProto()
        optimizer = popart.ConstSGD(0.01)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1_loss),
                                     loss=l1_loss,
                                     optimizer=optimizer,
                                     user_options=user_options)
    else:
        proto = builder.getModelProto()
        outputs, post_proto = run_py(proto,
                                     data,
                                     output,
                                     user_options=user_options)

    return [data[indices]], outputs, proto, post_proto
def test_nsp_fwd(custom_ops): # ------------------- PopART -------------------- config = BertConfig(task="NSP", vocab_length=9728, num_layers=2, micro_batch_size=1, hidden_size=768, sequence_length=128, activation_type="relu", popart_dtype="FLOAT", no_dropout=True, no_attn_dropout=True, inference=True, no_mask=True, mask_tokens=0, split_qkv=False) popart_model = Bert(config) # ------------------- PyTorch ------------------------- torch_model = BertForNextSentencePrediction( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act=config.activation_type, max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, num_labels=2)) fwd_graph(popart_model, torch_model, NSP_MAPPING, transform=NSP_TRANSFORM)
def test_nsp_fwd(custom_ops):
    # ------------------- PopART --------------------
    builder = popart.Builder(
        opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1})
    config = BertConfig(task="NSP",
                        vocab_length=9728,
                        num_layers=2,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        custom_ops=["gather", "attention"],
                        inference=True)
    popart_model = Bert(config, builder=builder)

    # ------------------- PyTorch -------------------------
    torch_model = BertForNextSentencePrediction(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act=config.activation_type,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens,
                        num_labels=2))

    fwd_graph(popart_model,
              torch_model,
              mapping=NSP_MAPPING,
              transform=NSP_TRANSFORM)
def test_pretraining_fwd(custom_ops):
    # ------------------- PopART --------------------
    builder = popart.Builder(
        opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1})
    config = BertConfig(task="PRETRAINING",
                        vocab_length=9728,
                        num_layers=2,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        popart_dtype="FLOAT",
                        activation_type="relu",
                        no_dropout=True,
                        custom_ops=["gather", "attention"],
                        inference=True)
    popart_model = Bert(config, builder=builder)

    # ------------------- PyTorch -------------------------
    torch_model = BertForMaskedLM(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens))

    fwd_graph(popart_model,
              torch_model,
              mapping=onnx_torch_mapping,
              transform=onnx_torch_tform)
def popart_result_and_model(popart_config, is_bwd=False, momentum=0.0):
    popart_model = Bert(popart_config)

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.micro_batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = popart_model.builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)

    if is_bwd:
        l1 = popart_model.builder.aiGraphcore.l1loss(
            [output],
            0.1,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        proto = popart_model.builder.getModelProto()

        # lr and num_reps_bwd are assumed to be constants defined at module
        # scope; they are not locals of this function.
        if momentum > 0.0:
            optimizer = popart.SGD({
                "defaultLearningRate": (lr, False),
                "defaultMomentum": (momentum, False),
                "defaultWeightDecay": (0.0, False)
            })
        else:
            optimizer = popart.ConstSGD(lr)

        outputs, post_proto = run_py(proto,
                                     data, (output, l1),
                                     loss=l1,
                                     optimizer=optimizer,
                                     num_reps=num_reps_bwd)
    else:
        proto = popart_model.builder.getModelProto()
        outputs, post_proto = run_py(proto, data, output)

    return data[input_tensor], outputs, proto, post_proto
def test_pretraining_bwd(custom_ops, opt_type):
    # ------------------- PopART --------------------
    config = BertConfig(task="PRETRAINING",
                        encoder_start_ipu=1,
                        vocab_length=1024,
                        micro_batch_size=1,
                        hidden_size=64,
                        attention_heads=2,
                        sequence_length=20,
                        max_positional_length=20,
                        mask_tokens=2,
                        popart_dtype="FLOAT",
                        activation_type="relu",
                        no_dropout=True,
                        no_attn_dropout=True,
                        update_embedding_dict=True,
                        no_cls_layer=True,
                        no_mask=True,
                        split_qkv=(opt_type == "LAMB"))
    popart_model = Bert(config)

    # ------------------- PyTorch -------------------------
    torch_model = BertForMaskedLM(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        update_embedding_dict=True,
                        mask_tokens=config.mask_tokens))

    l1_lambda = 0.1

    def popart_loss_fn(logits):
        loss = popart_model.builder.aiGraphcore.l1loss(
            [logits[0]],
            l1_lambda,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        popart_model.builder.virtualGraph(loss,
                                          popart_model.mlm_scope.virtualGraph)
        return loss

    bwd_graph(
        popart_model,
        torch_model,
        popart_loss_fn=popart_loss_fn,
        torch_loss_fn=lambda logits: l1_lambda * torch.norm(logits[0], 1),
        mapping={},
        transform=onnx_torch_tform,
        opt_type=opt_type)
def test_nsp_bwd(custom_ops): # ------------------- PopART -------------------- builder = popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }) config = BertConfig(task="NSP", vocab_length=9728, num_layers=1, batch_size=1, hidden_size=768, sequence_length=128, activation_type="relu", popart_dtype="FLOAT", no_dropout=True, custom_ops=["gather", "attention"]) popart_model = Bert(config, builder=builder) # ------------------- PyTorch ------------------------- torch_model = BertForNextSentencePrediction( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act="relu", max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, num_labels=2)) def popart_loss_fn(outputs): loss = popart.L1Loss(outputs[0], "l1Loss", 0.1) loss.virtualGraph(popart_model.nsp_scope.virtualGraph) return [loss] def torch_loss_fn(outputs): return 0.1 * torch.norm(outputs[0], 1) bwd_graph(popart_model, torch_model, popart_loss_fn=popart_loss_fn, torch_loss_fn=torch_loss_fn, mapping={ "bert.pooler.dense.weight": "NSP/PoolW", "bert.pooler.dense.bias": "NSP/PoolB", "cls.seq_relationship.weight": "NSP/NspW", "cls.seq_relationship.bias": "NSP/NspB" }, transform={ "bert.pooler.dense.weight": np.transpose, "cls.seq_relationship.weight": np.transpose })
def popart_result_and_model(popart_config,
                            weight_decay=0.0,
                            lr=0.0,
                            l1_lambda=0.0):
    popart_model = Bert(popart_config)
    builder = popart_model.builder

    input_info = popart.TensorInfo(popart_config.popart_dtype, [
        popart_config.micro_batch_size * popart_config.sequence_length,
        popart_config.hidden_size
    ])
    input_tensor = builder.addInputTensor(input_info)

    data = {
        input_tensor:
        np.random.normal(0, 0.02,
                         input_info.shape()).astype(popart_config.dtype)
    }

    output = popart_model.feed_forward(input_tensor)

    l1 = builder.aiGraphcore.l1loss([output],
                                    l1_lambda,
                                    debugContext="l1LossVal",
                                    reduction=popart.ReductionType.Sum)
    proto = builder.getModelProto()

    iteration = MockIteration()
    args = MockArgs("SGD", lr, weight_decay)
    optimizer_factory = BaseOptimizerFactory(args, iteration,
                                             popart_model.tensors)
    optimizer = optimizer_factory.create()

    outputs, post_proto = run_py(proto,
                                 data, (output, l1),
                                 loss=l1,
                                 optimizer=optimizer)

    return data[input_tensor], outputs, proto, post_proto
def test_nsp_bwd(custom_ops, opt_type):
    # ------------------- PopART --------------------
    config = BertConfig(task="NSP",
                        vocab_length=2432,
                        micro_batch_size=1,
                        hidden_size=288,
                        sequence_length=128,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        no_attn_dropout=True,
                        no_mask=True,
                        update_embedding_dict=True,
                        split_qkv=(opt_type == "LAMB"))
    popart_model = Bert(config)

    # ------------------- PyTorch -------------------------
    torch_model = BertForNextSentencePrediction(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act=config.activation_type,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens,
                        update_embedding_dict=True,
                        num_labels=2))

    l1_lambda = 0.1

    def popart_loss_fn(outputs):
        loss = popart_model.builder.aiGraphcore.l1loss(
            [outputs[0]],
            l1_lambda,
            debugContext="l1LossVal",
            reduction=popart.ReductionType.Sum)
        popart_model.builder.virtualGraph(loss,
                                          popart_model.nsp_scope.virtualGraph)
        return loss

    def torch_loss_fn(outputs):
        return l1_lambda * torch.norm(outputs[0], 1)

    bwd_graph(popart_model,
              torch_model,
              popart_loss_fn=popart_loss_fn,
              torch_loss_fn=torch_loss_fn,
              mapping=NSP_MAPPING,
              transform=NSP_TRANSFORM,
              opt_type=opt_type)
def test_pretraining_bwd(custom_ops): # ------------------- PopART -------------------- builder = popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }) config = BertConfig(task="PRETRAINING", vocab_length=9728, projection_serialization_steps=4, num_layers=1, batch_size=1, hidden_size=768, sequence_length=128, popart_dtype="FLOAT", activation_type="relu", no_dropout=True, update_embedding_dict=False) popart_model = Bert(config, builder=builder) # ------------------- PyTorch ------------------------- torch_model = BertForMaskedLM( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act="relu", max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens)) l1_lambda = 0.1 def popart_loss_fn(logits): loss = builder.aiGraphcore.l1loss([logits[0]], l1_lambda, debugPrefix="l1LossVal", reduction=popart.ReductionType.Sum) builder.virtualGraph(loss, popart_model.mlm_scope.virtualGraph) return loss bwd_graph( popart_model, torch_model, popart_loss_fn=popart_loss_fn, torch_loss_fn=lambda logits: l1_lambda * torch.norm(logits[0], 1), mapping=onnx_torch_mapping, transform=onnx_torch_tform)
def test_nsp_bwd(custom_ops): # ------------------- PopART -------------------- builder = popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }) config = BertConfig(task="NSP", vocab_length=9728, num_layers=1, batch_size=1, hidden_size=768, sequence_length=128, activation_type="relu", popart_dtype="FLOAT", no_dropout=True) popart_model = Bert(config, builder=builder) # ------------------- PyTorch ------------------------- torch_model = BertForNextSentencePrediction( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act=config.activation_type, max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, num_labels=2)) def popart_loss_fn(outputs): loss = builder.aiGraphcore.l1loss([outputs[0]], 0.1, debugPrefix="l1Loss", reduction=popart.ReductionType.Sum) builder.virtualGraph(loss, popart_model.nsp_scope.virtualGraph) return loss def torch_loss_fn(outputs): return 0.1 * torch.norm(outputs[0], 1) bwd_graph(popart_model, torch_model, popart_loss_fn=popart_loss_fn, torch_loss_fn=torch_loss_fn, mapping=NSP_MAPPING, transform=NSP_TRANSFORM)
def test_squad_fwd():
    # ------------------- PopART --------------------
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        num_layers=2,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        custom_ops=[],
                        inference=True)
    popart_model = Bert(config, builder=builder)

    # ------------------- PyTorch -------------------------
    torch_model = BertForQuestionAnswering(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=config.mask_tokens,
                        num_labels=2))

    fwd_graph(popart_model,
              torch_model,
              mapping={
                  "cls.transform.dense.weight": "CLS/LMPredictionW",
                  "cls.transform.dense.bias": "CLS/LMPredictionB",
                  "cls.transform.LayerNorm.weight": "CLS/Gamma",
                  "cls.transform.LayerNorm.bias": "CLS/Beta",
                  "qa_outputs.weight": "Squad/SquadW",
                  "qa_outputs.bias": "Squad/SquadB"
              },
              transform={
                  "cls.transform.dense.weight": np.transpose,
                  "qa_outputs.weight": np.transpose
              })
def test_pretraining_fwd(custom_ops): # ------------------- PopART -------------------- config = BertConfig(task="PRETRAINING", encoder_start_ipu=1, vocab_length=1024, micro_batch_size=1, hidden_size=64, attention_heads=2, sequence_length=20, max_positional_length=20, mask_tokens=2, popart_dtype="FLOAT", activation_type="relu", no_dropout=True, no_attn_dropout=True, no_cls_layer=False, inference=True, no_mask=True, split_qkv=False) popart_model = Bert(config) # ------------------- PyTorch ------------------------- torch_model = BertForMaskedLM( TorchBertConfig(config.vocab_length, config.hidden_size, num_hidden_layers=config.num_layers, num_attention_heads=config.attention_heads, intermediate_size=config.ff_size, hidden_act="relu", max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps, mask_tokens=config.mask_tokens, no_cls_layer=config.no_cls_layer)) fwd_graph(popart_model, torch_model, mapping=ONNX_TORCH_MAPPING, transform=onnx_torch_tform)
def test_squad_fwd(custom_ops):
    # ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        encoder_start_ipu=1,
                        vocab_length=1024,
                        micro_batch_size=1,
                        hidden_size=64,
                        attention_heads=2,
                        sequence_length=20,
                        max_positional_length=20,
                        activation_type="relu",
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        no_attn_dropout=True,
                        inference=True,
                        no_mask=True,
                        split_qkv=False,
                        squad_single_output=False)
    popart_model = Bert(config)

    # ------------------- PyTorch -------------------------
    torch_model = BertForQuestionAnswering(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        num_hidden_layers=config.num_layers,
                        num_attention_heads=config.attention_heads,
                        intermediate_size=config.ff_size,
                        hidden_act="relu",
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        mask_tokens=2,
                        num_labels=2))

    fwd_graph(popart_model,
              torch_model,
              mapping=ONNX_TORCH_MAPPING,
              transform={"qa_outputs.weight": np.transpose})
def pooled_validation_run(bert_args,
                          config,
                          initializers,
                          checkpoint_paths,
                          num_processes=1,
                          available_ipus=16):
    logger.info("Building Model")

    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers)

    indices, positions, segments, masks, labels = bert_add_inputs(
        bert_args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks,
                               bert_args.execution_mode)

    inputs = [indices, positions, segments, *masks]
    outputs = bert_add_logit_outputs(model, logits)

    with tempfile.TemporaryDirectory() as temp_results_path:
        # Inject the checkpoint-specific squad results directory into the
        # dataset args, otherwise they overwrite each other when multithreaded
        bert_args.squad_results_dir = temp_results_path

        dataset = get_bert_dataset(
            model, bert_args, [indices, positions, segments, masks, labels])
        logger.info(f"Dataset length: {len(dataset)}")

        data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

        iteration = Iteration(
            bert_args,
            batches_per_step=dataset.batches_per_step,
            steps_per_epoch=len(dataset),
            writer=None,
            recording_steps=bert_args.aggregate_metrics_over_steps)

        request_ipus, _ = calc_required_ipus(bert_args, model)

        if request_ipus * num_processes > available_ipus:
            raise ValueError(
                "Cannot run with requested number of processes - too many "
                "IPUs required")

        device = acquire_device(bert_args, request_ipus)

        session, anchors = bert_inference_session(model, bert_args, data_flow,
                                                  device)

        model_results = recursive_defaultdict()
        for path in checkpoint_paths:
            session.resetHostWeights(str(path.absolute()))
            session.weightsFromHost()

            logger.info(f"Inference started for checkpoint: {path.absolute()}")
            result = run_inference_extract_result(bert_args, session, dataset,
                                                  inputs, logits, anchors,
                                                  iteration)

            result_into_recursive_path(model_results, path,
                                       bert_args.checkpoint_dir, result)

        device.detach()
    return model_results
def test_embedding_fwd(custom_ops): # ------------------- PopART -------------------- config = BertConfig(task="SQUAD", vocab_length=9728, micro_batch_size=1, hidden_size=768, sequence_length=128, activation_type='relu', popart_dtype="FLOAT", no_dropout=True, inference=True) popart_model = Bert(config) sequence_info = popart.TensorInfo( "UINT32", [config.micro_batch_size * config.sequence_length]) indices = popart_model.builder.addInputTensor(sequence_info) positions = popart_model.builder.addInputTensor(sequence_info) segments = popart_model.builder.addInputTensor(sequence_info) data = { indices: np.random.randint( 0, config.vocab_length, (config.micro_batch_size * config.sequence_length)).astype( np.uint32), positions: np.random.randint( 0, config.max_positional_length, (config.micro_batch_size * config.sequence_length)).astype( np.uint32), segments: np.random.randint( 0, 2, (config.micro_batch_size * config.sequence_length)).astype( np.uint32) } user_options = {"enableStochasticRounding": True} with popart_model.builder.nameScope("Embedding"): output = popart_model.embedding(indices, positions, segments) proto = popart_model.builder.getModelProto() outputs, post_proto = run_py(proto, data, output, user_options=user_options) # ----------------- PopART -> PyTorch ---------------- proto = onnx.load_model_from_string(proto) inputs = [ data[t].reshape(config.micro_batch_size, config.sequence_length).astype(np.int32) for t in [indices, positions, segments] ] # ------------------- PyTorch ------------------------- torch_model = BertEmbeddings( TorchBertConfig(config.vocab_length, config.hidden_size, max_position_embeddings=config.max_positional_length, layer_norm_eps=config.layer_norm_eps)) torch_model.eval() copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {}) torch_outputs = run_fwd_model(inputs, torch_model) check_tensors(torch_outputs, outputs, margin=5e-7)
def test_embedding_bwd(custom_ops):
    # ------------------- PopART --------------------
    config = BertConfig(task="SQUAD",
                        vocab_length=9728,
                        micro_batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        update_embedding_dict=True)
    popart_model = Bert(config)
    # Prevent virtualGraph attributes being added to the ops

    sequence_info = popart.TensorInfo(
        "UINT32", [config.micro_batch_size * config.sequence_length])
    indices = popart_model.builder.addInputTensor(sequence_info)
    positions = popart_model.builder.addInputTensor(sequence_info)
    segments = popart_model.builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.max_positional_length,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (config.micro_batch_size *
                           config.sequence_length)).astype(np.uint32)
    }

    optimizer = popart.ConstSGD(0.01)
    l1_lambda = 0.1

    with popart_model.builder.nameScope("Embedding"):
        output = popart_model.embedding(indices, positions, segments)

    l1 = popart_model.builder.aiGraphcore.l1loss(
        [output],
        l1_lambda,
        debugContext="l1LossVal",
        reduction=popart.ReductionType.Sum)

    num_reps = 5
    proto = popart_model.builder.getModelProto()
    outputs, post_proto = run_py(proto,
                                 data,
                                 output,
                                 ipus=1,
                                 loss=l1,
                                 num_reps=num_reps,
                                 optimizer=optimizer)

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.micro_batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    # ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps,
                        update_embedding_dict=config.update_embedding_dict))
    # Turn off dropout
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, TORCH_TO_ONNX, {})

    optim = torch.optim.SGD(torch_model.parameters(), 0.01)
    for _ in range(num_reps):
        torch_output = torch_model(
            *[torch.from_numpy(t).long() for t in inputs])
        torch_loss = l1_lambda * torch.norm(torch_output, 1)
        torch_loss.backward()
        optim.step()
        optim.zero_grad()

    torch_outputs = [torch_output.detach().numpy()]

    check_tensors(torch_outputs, outputs, margin=7e-6)

    check_model(torch_model, post_proto, TORCH_TO_ONNX, {}, margin=7e-06)
def main(args): set_library_seeds(args.seed) config = bert_config_from_args(args) initializers = bert_pretrained_initialisers(config, args) logger.info("Building Model") # Specifying ai.onnx opset9 for the slice syntax # TODO: Change slice to opset10 model = Bert(config, builder=popart.Builder( opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1}), initializers=initializers) indices, positions, segments, masks, labels = bert_add_inputs(args, model) logits = bert_logits_graph(model, indices, positions, segments, masks) if args.inference: outputs = bert_add_infer_outputs(model, logits) losses = [] writer = None else: predictions, probs = bert_infer_graph(model, logits) losses = bert_loss_graph(model, probs, labels) outputs = bert_add_validation_outputs(model, predictions, losses) writer = bert_writer(args) dataset = get_bert_dataset(model, args, [indices, positions, segments, masks, labels]) logger.info(f"Dataset length: {len(dataset)}") data_flow = popart.DataFlow(dataset.batches_per_step, outputs) iteration = Iteration( args, batches_per_step=dataset.batches_per_step, steps_per_epoch=len(dataset), writer=writer, recording_steps=args.aggregate_metrics_over_steps) request_ipus, required_ipus = calc_required_ipus(args, model) device = acquire_device(args, request_ipus) if args.inference: session, anchors = bert_inference_session(model, args, data_flow, losses, device) logger.info("Inference Started") bert_infer_loop(args, session, dataset, logits, anchors, iteration) device.detach() else: if not args.no_training: optimizer_factory = ScheduledOptimizerFactory(args, iteration, model.pipeline_stage_tensors) session, anchors = bert_training_session(model, args, data_flow, losses, device, optimizer_factory) logger.info("Training Started") bert_train_loop(args, session, writer, dataset, labels, predictions, losses, anchors, iteration, optimizer_factory) device.detach() logger.info("Training Finished") if not args.no_validation: logger.info("Doing Validation") main(utils.get_validation_args(args)) return session, iteration
def main(args): set_library_seeds(args.seed) config = bert_config_from_args(args) initializers = bert_pretrained_initialisers(config, args) logger.info("Building Model") # Specifying ai.onnx opset9 for the slice syntax model = Bert(config, builder=popart.Builder(opsets={ "ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1 }), initializers=initializers, execution_mode=args.execution_mode) # If config.host_embedding is enabled, indices and positions will have the matrices instead of the index vector. indices, positions, segments, masks, labels = bert_add_inputs(args, model) logits = bert_logits_graph(model, indices, positions, segments, masks) if args.inference: predictions = None losses = [] if args.task == "PRETRAINING": # If this is a pretraining session, labels for NSP and MLM are already within the dataset, # so we can always calculate prediction performance predictions, _ = bert_infer_graph(model, logits, include_probs=False) if args.inference_lm_perplexity: losses = bert_perplexity_graph(model, logits, labels) outputs = bert_add_validation_outputs(model, predictions, losses) else: if args.inference_lm_perplexity: raise RuntimeError( "Masked LM perplexity is only supported in pretraining.") outputs = bert_add_logit_outputs(model, logits) writer = None else: predictions, probs = bert_infer_graph(model, logits) losses = bert_loss_graph(model, probs, labels) outputs = bert_add_validation_outputs(model, predictions, losses) writer = bert_writer(args) embedding_dict, positional_dict = model.get_model_embeddings() dataset = get_bert_dataset(model, args, [indices, positions, segments, masks, labels], embedding_dict, positional_dict, config.host_embedding == "MERGE") logger.info(f"Dataset length: {len(dataset)}") data_flow = popart.DataFlow(dataset.batches_per_step, outputs) iteration = Iteration(args, batches_per_step=dataset.batches_per_step, steps_per_epoch=len(dataset), writer=writer, recording_steps=args.aggregate_metrics_over_steps) request_ipus, required_ipus = calc_required_ipus(args, model) device = acquire_device(args, request_ipus) if args.inference: session, anchors = bert_inference_session(model, args, data_flow, device) logger.info("Inference Started") inputs = [indices, positions, segments, *masks, *labels] bert_infer_loop(args, session, dataset, inputs, logits, anchors, labels, predictions, losses, iteration) device.detach() else: if not args.no_training: optimizer_factory = ScheduledOptimizerFactory( args, iteration, model.tensors) session, anchors = bert_training_session(model, args, data_flow, losses, device, optimizer_factory) logger.info("Training Started") bert_train_loop(args, session, writer, dataset, labels, predictions, losses, anchors, iteration, optimizer_factory) device.detach() logger.info("Training Finished") return session, iteration
def run_embedding_layer(args):
    set_library_seeds(args.seed)

    config = bert_config_from_args(args)

    initializers = bert_pretrained_initialisers(config, args)

    logger.info("Building Model")
    # Specifying ai.onnx opset9 for the slice syntax
    # TODO: Change slice to opset10
    model = Bert(config,
                 builder=popart.Builder(opsets={
                     "ai.onnx": 9,
                     "ai.onnx.ml": 1,
                     "ai.graphcore": 1
                 }),
                 initializers=initializers,
                 execution_mode=args.execution_mode)

    # If config.host_embedding is enabled, indices and positions will have
    # the matrices instead of the index vector.
    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    logits = tuple([model.embedding(indices, positions, segments)])

    if args.inference:
        outputs = bert_add_logit_outputs(model, logits)
        writer = None

        dataset = get_bert_dataset(
            model, args, [indices, positions, segments, masks, labels])

        data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

        iteration = Iteration(
            args,
            steps_per_epoch=len(dataset),
            writer=writer,
            recording_steps=args.aggregate_metrics_over_steps)

        request_ipus = bert_required_ipus(args, model)

        device = acquire_device(args, request_ipus)

        session, anchors = bert_inference_session(model, args, data_flow,
                                                  device)
        logger.info("Inference Started")
        inputs = [indices, positions, segments, *masks]
        # bert_infer_loop(args, session, dataset, inputs, logits, anchors,
        #                 iteration)
        save_results = args.task == "SQUAD" and not (args.synthetic_data
                                                     or args.generated_data)

        start_times = defaultdict(list)
        end_times = defaultdict(list)

        # Create the stepio once outside of the inference loop:
        static_data = {}
        if args.low_latency_inference and args.task == "SQUAD":
            stepio = create_callback_stepio(static_data, anchors, start_times,
                                            end_times,
                                            dataset.batches_per_step)
        else:
            stepio = None

        output = []
        logger.info(dataset)
        for data in dataset:
            static_data.update({t: data[t] for t in inputs})
            result = bert_process_infer_data(args, session, static_data,
                                             anchors, logits, iteration,
                                             start_times, end_times, stepio)

            if save_results:
                output.append(result)

            break

        device.detach()
        return output

    return None
def test_embedding_fwd(custom_ops):
    # ------------------- PopART --------------------
    builder = popart.Builder(opsets={
        "ai.onnx": 9,
        "ai.onnx.ml": 1,
        "ai.graphcore": 1
    })
    config = BertConfig(vocab_length=9728,
                        batch_size=1,
                        hidden_size=768,
                        sequence_length=128,
                        activation_type='relu',
                        popart_dtype="FLOAT",
                        no_dropout=True,
                        custom_ops=['gather'],
                        inference=True)
    popart_model = Bert(config, builder=builder)
    # Prevent virtualGraph attributes being added to the ops.
    popart_model.embedding_scope = popart_model.device_scope(None, None)
    popart_model.embedding_split_scope = popart_model.embedding_scope

    sequence_info = popart.TensorInfo(
        "UINT32", [config.batch_size * config.sequence_length])
    indices = builder.addInputTensor(sequence_info)
    positions = builder.addInputTensor(sequence_info)
    segments = builder.addInputTensor(sequence_info)
    data = {
        indices:
        np.random.randint(0, config.vocab_length,
                          (config.batch_size *
                           config.sequence_length)).astype(np.uint32),
        positions:
        np.random.randint(0, config.max_positional_length,
                          (config.batch_size *
                           config.sequence_length)).astype(np.uint32),
        segments:
        np.random.randint(0, 2,
                          (config.batch_size *
                           config.sequence_length)).astype(np.uint32)
    }

    # Use the custom embedding for layout
    output = popart_model.embedding(indices, positions, segments)

    proto = builder.getModelProto()

    outputs, post_proto = run_py(
        proto, data, output, user_options={"enableStochasticRounding": True})

    # ----------------- PopART -> PyTorch ----------------
    proto = onnx.load_model_from_string(proto)

    inputs = [
        data[t].reshape(config.batch_size,
                        config.sequence_length).astype(np.int32)
        for t in [indices, positions, segments]
    ]

    torch_to_onnx = {
        "word_embeddings.weight": "Embedding_Dict",
        "position_embeddings.weight": "Positional_Dict",
        "token_type_embeddings.weight": "Segment_Dict",
        "LayerNorm.weight": "Gamma",
        "LayerNorm.bias": "Beta"
    }

    transposed_weights = {
        "word_embeddings.weight": np.transpose,
        "position_embeddings.weight": np.transpose,
    }

    # ------------------- PyTorch -------------------------
    torch_model = BertEmbeddings(
        TorchBertConfig(config.vocab_length,
                        config.hidden_size,
                        max_position_embeddings=config.max_positional_length,
                        layer_norm_eps=config.layer_norm_eps))
    torch_model.eval()

    copy_weights_to_torch(torch_model, proto, torch_to_onnx,
                          transposed_weights)

    torch_outputs = run_fwd_model(inputs, torch_model)

    check_tensors(torch_outputs, outputs)