t_log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                         num_classes=vocab_size,
                                         num_layers=1,
                                         log_softmax=True)

beam_translator = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
    log_softmax=t_log_softmax,
    max_seq_length=args.max_seq_length,
    beam_size=args.beam_size,
    length_penalty=args.len_pen,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=0, smoothing=0.1)
loss_eval = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=0, smoothing=0.0)

# tie all embedding weights
t_log_softmax.mlp.last_linear_layer.weight = \
    encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.token_embedding.weight = \
    encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.position_embedding.weight = \
    encoder.bert.embeddings.position_embeddings.weight

# training pipeline
src, src_mask, tgt, tgt_mask, labels, sent_ids = train_data_layer()
input_type_ids = zeros_transform(input_type_ids=src)
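# The tying assignments above work because PyTorch lets two modules point at the
# same nn.Parameter: the classifier's output projection and the BERT word
# embeddings are both [vocab_size, d_model] matrices. A minimal plain-PyTorch
# sketch of the idea (toy sizes and names, not the NeMo internals):
import torch.nn as nn

vocab_size, d_model = 1000, 512
word_embeddings = nn.Embedding(vocab_size, d_model)
output_projection = nn.Linear(d_model, vocab_size, bias=False)

# nn.Linear stores its weight as [out_features, in_features] = [vocab_size, d_model],
# so the embedding matrix can be shared directly; gradients from both paths
# accumulate into the single shared parameter.
output_projection.weight = word_embeddings.weight
assert output_projection.weight.data_ptr() == word_embeddings.weight.data_ptr()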
log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                       num_classes=tgt_tokenizer.vocab_size,
                                       num_layers=1,
                                       log_softmax=True)

beam_search = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_seq_length,
    beam_size=args.beam_size,
    bos_token=tgt_tokenizer.bos_id(),
    pad_token=tgt_tokenizer.pad_id(),
    eos_token=tgt_tokenizer.eos_id())

loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    pad_id=tgt_tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)

if tie_weight:
    log_softmax.mlp.last_linear_layer.weight = \
        encoder.embedding_layer.token_embedding.weight
    decoder.embedding_layer.token_embedding.weight = \
        encoder.embedding_layer.token_embedding.weight


def create_pipeline(dataset_src,
                    dataset_tgt,
                    tokens_in_batch,
                    clean=False,
                    training=True):
    data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=src_tokenizer,
                                               tokenizer_tgt=tgt_tokenizer,
    share_all_layers=args.share_decoder_layers)

log_softmax = nemo_nlp.TransformerLogSoftmaxNM(factory=neural_factory,
                                               vocab_size=vocab_size,
                                               d_model=args.d_model,
                                               d_embedding=args.d_embedding)

beam_search = nemo_nlp.BeamSearchTranslatorNM(
    factory=neural_factory,
    decoder=decoder,
    log_softmax=log_softmax,
    max_seq_length=args.max_sequence_length,
    beam_size=args.beam_size,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    factory=neural_factory,
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)

# tie weights of the embedding and log_softmax layers
if args.tie_enc_dec:
    decoder.embedding_layer.token_embedding.weight = \
        encoder.embedding_layer.token_embedding.weight
    if args.tie_projs:
        decoder.embedding_layer.token2hidden.weight = \
            encoder.embedding_layer.token2hidden.weight
if args.tie_enc_softmax:
    log_softmax.log_softmax.dense.weight = \
        encoder.embedding_layer.token_embedding.weight
    if args.tie_projs:
        log_softmax.log_softmax.hidden2token.weight = \
            encoder.embedding_layer.token2hidden.weight
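# When d_embedding differs from d_model, the model above factorizes the embedding:
# a [vocab, d_embedding] token table, a token2hidden projection up to d_model on the
# input side, and a hidden2token projection back down in front of the softmax.
# --tie_enc_softmax reuses the token table as the output projection and --tie_projs
# shares the up/down projection. A rough plain-PyTorch sketch of that factorization
# with both ties applied (toy sizes; names are illustrative, not the NeMo attributes):
import torch
import torch.nn.functional as F

vocab, d_embedding, d_model = 32000, 128, 512
token_embedding = torch.nn.Embedding(vocab, d_embedding)
proj = torch.nn.Parameter(torch.randn(d_model, d_embedding) * 0.02)  # shared projection

tokens = torch.randint(0, vocab, (2, 7))                  # [batch, seq]
hidden = F.linear(token_embedding(tokens), proj)          # token2hidden: d_embedding -> d_model
# (the transformer layers that would sit between these two steps are omitted)
emb_space = F.linear(hidden, proj.t())                    # hidden2token: d_model -> d_embedding (tied)
logits = F.linear(emb_space, token_embedding.weight)      # output projection tied to the token table
log_probs = F.log_softmax(logits, dim=-1)                 # [batch, seq, vocab]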
    d_model=args.d_model,
    d_embedding=args.d_embedding)

beam_translator = nemo_nlp.BeamSearchTranslatorNM(
    factory=neural_factory,
    decoder=decoder,
    log_softmax=t_log_softmax,
    max_seq_length=max_sequence_length,
    beam_size=args.beam_size,
    length_penalty=args.len_pen,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(factory=neural_factory,
                                                 pad_id=0,
                                                 smoothing=0.1)
loss_eval = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(factory=neural_factory,
                                                      pad_id=0,
                                                      smoothing=0.0)

if args.encoder == "hf":
    # tie all embedding weights
    t_log_softmax.log_softmax.dense.weight = \
        encoder.bert.embeddings.word_embeddings.weight
    decoder.embedding_layer.token_embedding.weight = \
        encoder.bert.embeddings.word_embeddings.weight
    decoder.embedding_layer.position_embedding.weight = \
        encoder.bert.embeddings.position_embeddings.weight
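# For reference, a padded, label-smoothed cross entropy like the losses above can be
# written in plain PyTorch roughly as follows: put (1 - smoothing) of the target mass
# on the gold token, spread the rest uniformly over the vocabulary, and ignore
# positions whose label is the pad id. This is one common formulation given as a
# sketch, not the NeMo implementation:
import torch

def padded_smoothed_loss(log_probs, labels, pad_id, smoothing=0.1):
    """log_probs: [batch, seq, vocab] log-softmax outputs; labels: [batch, seq]."""
    nll = -log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)  # gold-token term
    uniform = -log_probs.mean(dim=-1)                              # smoothing term
    loss = (1.0 - smoothing) * nll + smoothing * uniform
    mask = (labels != pad_id).float()                              # drop padded positions
    return (loss * mask).sum() / mask.sum().clamp(min=1.0)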
    vocab_size=vocab_size,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_seq_length,
    embedding_dropout=args.embedding_dropout,
    learn_positional_encodings=True,
    hidden_act="gelu")
decoder.restore_from(args.restore_from, local_rank=args.local_rank)

t_log_softmax = nemo_nlp.TokenClassifier(args.d_model,
                                         num_classes=vocab_size,
                                         num_layers=1,
                                         log_softmax=True)

loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(),
                                                    smoothing=0.1)

beam_search = nemo_nlp.BeamSearchTranslatorNM(
    decoder=decoder,
    log_softmax=t_log_softmax,
    max_seq_length=args.max_seq_length,
    beam_size=args.beam_size,
    length_penalty=args.len_pen,
    bos_token=tokenizer.bos_id(),
    pad_token=tokenizer.pad_id(),
    eos_token=tokenizer.eos_id())

# tie all embedding weights
t_log_softmax.mlp.layer0.weight = \
    encoder.bert.embeddings.word_embeddings.weight
decoder.embedding_layer.token_embedding.weight = \
    encoder.bert.embeddings.word_embeddings.weight
    d_inner=args.d_inner,
    num_layers=args.num_layers,
    embedding_dropout=args.embedding_dropout,
    num_attn_heads=args.num_attn_heads,
    ffn_dropout=args.ffn_dropout,
    vocab_size=vocab_size,
    mask_future=True,
    attn_score_dropout=args.attn_score_dropout,
    attn_layer_dropout=args.attn_layer_dropout,
    max_seq_length=args.max_sequence_length)

log_softmax = nemo_nlp.TransformerLogSoftmaxNM(factory=neural_factory,
                                               vocab_size=vocab_size,
                                               d_model=args.d_model,
                                               d_embedding=args.d_embedding)

loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    factory=neural_factory,
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing)
loss_eval = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(
    factory=neural_factory,
    pad_id=tokenizer.pad_id(),
    label_smoothing=args.label_smoothing,
    predict_last_k=args.predict_last_k)

if args.tie_enc_softmax:
    log_softmax.log_softmax.dense.weight = \
        encoder.embedding_layer.token_embedding.weight
    if args.tie_projs:
        log_softmax.log_softmax.hidden2token.weight = \
            encoder.embedding_layer.token2hidden.weight

# training pipeline