tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])
# round the vocabulary size up to the nearest multiple of 8
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)

bert_model = nemo_nlp.huggingface.BERT(
    vocab_size=vocab_size,
    num_layers=args.num_layers,
    d_model=args.d_model,
    num_heads=args.num_heads,
    d_inner=args.d_inner,
    max_seq_length=args.max_sequence_length,
    hidden_act="gelu",
    factory=neural_factory)

# instantiate necessary modules for the whole pretraining pipeline, namely
# data layers, BERT encoder, and MLM and NSP loss functions
mlm_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(
    vocab_size=vocab_size,
    d_model=args.d_model,
    factory=neural_factory)
mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM(factory=neural_factory)

nsp_log_softmax = nemo_nlp.SentenceClassificationLogSoftmaxNM(
    d_model=args.d_model,
    num_classes=2,
    factory=neural_factory)
nsp_loss = nemo_nlp.NextSentencePredictionLossNM(factory=neural_factory)

# aggregates the MLM and NSP losses into a single training objective
bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2, factory=neural_factory)

# tie weights of MLM softmax layer and embedding layer of the encoder
mlm_log_softmax.log_softmax.dense.weight = \
    bert_model.bert.embeddings.word_embeddings.weight

train_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
# Transformer decoder stack over the target-language vocabulary
decoder = nemo_nlp.TransformerDecoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length,
    share_all_layers=args.share_decoder_layers)

# projects decoder hidden states to log-probabilities over the vocabulary
log_softmax = nemo_nlp.TransformerLogSoftmaxNM(
    factory=neural_factory,
    vocab_size=vocab_size,
    d_model=args.d_model,
    d_embedding=args.d_embedding)

# beam search generator used to produce translations at evaluation time
beam_search = nemo_nlp.BeamSearchTranslatorNM(
    factory=neural_factory,
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_sequence_length,
    beam_size=args.beam_size,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

# label-smoothed cross entropy that ignores padding positions
loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    factory=neural_factory,
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)
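The modules above only declare the pieces of the translation model; they still have to be chained into a DAG by calling each one on the outputs of a data layer, in the same style as the create_pipeline function of the language-modeling example below. The sketch that follows is illustrative only: the data layer and every port/keyword name in it (input_ids_tgt, hidden_states_src, and so on) are assumptions rather than verbatim NeMo API, and encoder refers to a TransformerEncoderNM assumed to be instantiated earlier in the tutorial.

# Illustrative sketch only: the data layer and all port names below are
# assumptions about the wiring, not verbatim NeMo API.
src, src_mask, tgt, tgt_mask, labels, sent_ids = train_data_layer()  # assumed translation data layer
src_hiddens = encoder(input_ids=src, input_mask_src=src_mask)
tgt_hiddens = decoder(input_ids_tgt=tgt,
                      hidden_states_src=src_hiddens,
                      input_mask_src=src_mask,
                      input_mask_tgt=tgt_mask)
log_probs = log_softmax(hidden_states=tgt_hiddens)
train_loss = loss(log_probs=log_probs, target_ids=labels)

# at evaluation time, translations would instead come from beam search
# over the encoder states
translations = beam_search(hidden_states_src=src_hiddens,
                           input_mask_src=src_mask)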
# Transformer encoder with causal masking (mask_future=True) so it can be
# trained as a left-to-right language model
encoder = nemo_nlp.TransformerEncoderNM(
    d_model=args.d_model,
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    mask_future=True,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length)

log_softmax = nemo_nlp.TransformerLogSoftmaxNM(
    vocab_size=vocab_size,
    d_model=args.d_model)

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)

# tie weights of embedding and log_softmax layers
log_softmax.log_softmax.dense.weight = \
    encoder.embedding_layer.token_embedding.weight


def create_pipeline(dataset, batch_size):
    data_layer = nemo_nlp.LanguageModelingDataLayer(dataset,
                                                    batch_size=batch_size)
    # input token ids, padding mask, and language-modeling labels
    src, src_mask, labels = data_layer()