Example #1
import math  # the surrounding script also provides `nemo_nlp`, `neural_factory`, `args`, and `tokenizer`

tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])
# round the vocabulary size up to a multiple of 8 so mixed-precision (tensor core) kernels stay efficient
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)

bert_model = nemo_nlp.huggingface.BERT(vocab_size=vocab_size,
                                       num_layers=args.num_layers,
                                       d_model=args.d_model,
                                       num_heads=args.num_heads,
                                       d_inner=args.d_inner,
                                       max_seq_length=args.max_sequence_length,
                                       hidden_act="gelu",
                                       factory=neural_factory)

# instantiate the necessary modules for the whole BERT pretraining pipeline, namely
# the data layers, BERT encoder, and the MLM and NSP classification heads and losses
mlm_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(vocab_size=vocab_size,
                                                   d_model=args.d_model,
                                                   factory=neural_factory)
mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM(factory=neural_factory)

nsp_log_softmax = nemo_nlp.SentenceClassificationLogSoftmaxNM(
    d_model=args.d_model, num_classes=2, factory=neural_factory)
nsp_loss = nemo_nlp.NextSentencePredictionLossNM(factory=neural_factory)

bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2, factory=neural_factory)

# tie weights of MLM softmax layer and embedding layer of the encoder
mlm_log_softmax.log_softmax.dense.weight = \
    bert_model.bert.embeddings.word_embeddings.weight

train_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    # the remaining keyword arguments are assumptions; the original snippet is truncated at this point
    dataset=args.train_dataset,
    max_seq_length=args.max_sequence_length,
    mask_probability=args.mask_probability,
    batch_size=args.batch_size,
    factory=neural_factory)
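
The example stops before the modules are chained into a training graph. Below is a minimal sketch of how they could be wired together in NeMo's deferred call style; only the module names come from the example, while the data-layer outputs and all tensor port names are assumptions.

# hypothetical wiring -- port names are assumed, only the modules above are from the example
input_ids, input_type_ids, input_mask, output_ids, output_mask, nsp_labels = train_data_layer()
hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)
mlm_log_probs = mlm_log_softmax(hidden_states=hidden_states)
mlm_loss_t = mlm_loss(log_probs=mlm_log_probs,
                      output_ids=output_ids,
                      output_mask=output_mask)
nsp_log_probs = nsp_log_softmax(hidden_states=hidden_states)
nsp_loss_t = nsp_loss(log_probs=nsp_log_probs, labels=nsp_labels)
train_loss = bert_loss(loss_1=mlm_loss_t, loss_2=nsp_loss_t)
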
Example #2
# instantiate the decoder-side modules of the translation pipeline: Transformer decoder,
# vocabulary projection, beam-search generator (inference), and label-smoothed loss
decoder = nemo_nlp.TransformerDecoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length,
    share_all_layers=args.share_decoder_layers)
log_softmax = nemo_nlp.TransformerLogSoftmaxNM(factory=neural_factory,
                                               vocab_size=vocab_size,
                                               d_model=args.d_model,
                                               d_embedding=args.d_embedding)
beam_search = nemo_nlp.BeamSearchTranslatorNM(
    factory=neural_factory,
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_sequence_length,
    beam_size=args.beam_size,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())
loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    factory=neural_factory,
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)
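
This snippet only constructs the decoder-side modules. A minimal sketch of how they might be chained with an encoder and a parallel-corpus data layer follows; `encoder`, `train_data_layer`, and every port name here are assumptions rather than part of the example.

# hypothetical wiring -- `encoder`, `train_data_layer`, and all port names are assumed
src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = train_data_layer()
src_hiddens = encoder(input_ids=src_ids, input_mask_src=src_mask)
tgt_hiddens = decoder(input_ids_tgt=tgt_ids,
                      hidden_states_src=src_hiddens,
                      input_mask_src=src_mask,
                      input_mask_tgt=tgt_mask)
log_probs = log_softmax(hidden_states=tgt_hiddens)
train_loss = loss(log_probs=log_probs, target_ids=labels)

# at inference time the beam-search generator reuses the encoder outputs
translations = beam_search(hidden_states_src=src_hiddens, input_mask_src=src_mask)
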
Example #3
# instantiate the modules for language modeling: a causal Transformer encoder
# (mask_future=True hides future positions), vocabulary projection, and label-smoothed loss
encoder = nemo_nlp.TransformerEncoderNM(
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    mask_future=True,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length)

log_softmax = nemo_nlp.TransformerLogSoftmaxNM(
    vocab_size=vocab_size,
    d_model=args.d_model)

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)

# tie weight of embedding and log_softmax layers
log_softmax.log_softmax.dense.weight = \
    encoder.embedding_layer.token_embedding.weight
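
The assignment above points both modules at the same parameter object, so the output projection and the token embedding share one [vocab_size, d_model] matrix and are updated jointly. A quick sanity check, assuming both attributes are ordinary torch.nn.Parameter objects:

# sanity check: after tying, both modules reference the very same parameter tensor
assert log_softmax.log_softmax.dense.weight is \
    encoder.embedding_layer.token_embedding.weight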


# helper: build a data layer over `dataset` and wire it into the training/eval pipeline
def create_pipeline(dataset, batch_size):
    data_layer = nemo_nlp.LanguageModelingDataLayer(dataset,
                                                    batch_size=batch_size)
    src, src_mask, labels = data_layer()