def create_pipeline(dataset, tokens_in_batch, clean=False, training=True):
    # resolve the parallel corpus files for the requested split
    dataset_src = os.path.join(args.data_dir, dataset + "." + args.src_lang)
    dataset_tgt = os.path.join(args.data_dir, dataset + "." + args.tgt_lang)
    data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=tokenizer,
                                               tokenizer_tgt=tokenizer,
                                               dataset_src=dataset_src,
                                               dataset_tgt=dataset_tgt,
                                               tokens_in_batch=tokens_in_batch,
                                               clean=clean)
    src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer()
    # the BERT-style encoder expects token type ids; here they are all zeros
    input_type_ids = zeros_transform(input_type_ids=src)
    src_hiddens = encoder(input_ids=src,
                          token_type_ids=input_type_ids,
                          attention_mask=src_mask)
    tgt_hiddens = decoder(input_ids_tgt=tgt,
                          hidden_states_src=src_hiddens,
                          input_mask_src=src_mask,
                          input_mask_tgt=tgt_mask)
    log_softmax = t_log_softmax(hidden_states=tgt_hiddens)
    loss = loss_fn(logits=log_softmax, target_ids=labels)
    beam_results = None
    if not training:
        # beam search decoding is only needed for evaluation
        beam_results = beam_search(hidden_states_src=src_hiddens,
                                   input_mask_src=src_mask)
    return loss, [tgt, loss, beam_results, sent_ids]
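# A minimal usage sketch for the factory above. It assumes args.batch_size and
# args.eval_batch_size exist and that "train"/"valid" splits are present in
# args.data_dir; the split names are illustrative, not taken from the original.
train_loss, _ = create_pipeline("train", args.batch_size,
                                clean=True, training=True)
eval_loss, eval_tensors = create_pipeline("valid", args.eval_batch_size,
                                          clean=False, training=False)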
def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False,
                    training=True):
    # the data layer batches parallel sentences by total token count
    data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=src_tokenizer,
                                               tokenizer_tgt=tgt_tokenizer,
                                               dataset_src=dataset_src,
                                               dataset_tgt=dataset_tgt,
                                               tokens_in_batch=tokens_in_batch,
                                               clean=clean)
    src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer()
    src_hiddens = encoder(input_ids=src, input_mask_src=src_mask)
    tgt_hiddens = decoder(input_ids_tgt=tgt,
                          hidden_states_src=src_hiddens,
                          input_mask_src=src_mask,
                          input_mask_tgt=tgt_mask)
    logits = log_softmax(hidden_states=tgt_hiddens)
    loss = loss_fn(logits=logits, target_ids=labels)
    beam_results = None
    if not training:
        # beam search decoding is only needed for evaluation
        beam_results = beam_search(hidden_states_src=src_hiddens,
                                   input_mask_src=src_mask)
    return loss, [tgt, loss, beam_results, sent_ids]
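# Hedged usage sketch for the variant above that takes explicit file paths; the
# args fields and file layout below are assumptions for illustration only.
train_loss, _ = create_pipeline(
    dataset_src=f"{args.data_dir}/train.{args.src_lang}",
    dataset_tgt=f"{args.data_dir}/train.{args.tgt_lang}",
    tokens_in_batch=args.batch_size,
    clean=True,
    training=True)
eval_loss, eval_tensors = create_pipeline(
    dataset_src=f"{args.data_dir}/valid.{args.src_lang}",
    dataset_tgt=f"{args.data_dir}/valid.{args.tgt_lang}",
    tokens_in_batch=args.eval_batch_size,
    clean=False,
    training=False)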
# define the parameters for the first sub-layer in the Transformer decoder block
dec_first_sublayer_params = {
    "first_sub_layer": "self_attention",
    "attn_score_dropout": args.attn_score_dropout,
    "attn_layer_dropout": args.attn_layer_dropout,
}

tokenizer = NemoBertTokenizer(pretrained_model=args.pretrained_model)
# round the vocabulary size up to the nearest multiple of 8 (efficient for
# mixed-precision tensor cores) and compute how many dummy tokens to add
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)
tokens_to_add = vocab_size - tokenizer.vocab_size

train_data_layer = nemo_nlp.TranslationDataLayer(
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=args.data_dir + "test." + args.src_lang,
    dataset_tgt=args.data_dir + "test." + args.tgt_lang,
    tokens_in_batch=args.batch_size,
    clean=True)

eval_data_layers = {}
dataset_keys = ["dev_clean", "dev_other", "test_clean", "test_other"]
for key in dataset_keys:
    eval_data_layers[key] = nemo_nlp.TranslationDataLayer(
        tokenizer_src=tokenizer,
        tokenizer_tgt=tokenizer,
        dataset_src=args.data_dir + key + "." + args.src_lang,
        dataset_tgt=args.data_dir + key + "." + args.tgt_lang,
        tokens_in_batch=args.eval_batch_size,
        clean=False)
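# Hedged illustration of the vocabulary padding above: the numbers are made up;
# only the arithmetic matches the vocab_size / tokens_to_add lines.
example_vocab = 30522                            # hypothetical tokenizer vocab size
padded_vocab = 8 * math.ceil(example_vocab / 8)  # -> 30528
dummy_tokens = padded_vocab - example_vocab      # -> 6 extra ids appended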
    optimization_level=nemo.core.Optimization.mxprO2,
    placement=device)

# define tokenizer, in this example we use YouTokenToMe tokenizer trained
# on joint English & German data for both source and target languages
tokenizer = nemo_nlp.YouTokenToMeTokenizer(
    model_path=f"{args.data_root}/{args.tokenizer_model}")
vocab_size = tokenizer.vocab_size

# instantiate necessary modules for the whole translation pipeline, namely
# data layers, encoder, decoder, output log_softmax, beam_search_translator
# and loss function
train_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=f"{args.data_root}/{args.train_dataset}.{args.src_lang}",
    dataset_tgt=f"{args.data_root}/{args.train_dataset}.{args.tgt_lang}",
    tokens_in_batch=args.batch_size,
    clean=True)
eval_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=f"{args.data_root}/{args.eval_datasets[0]}.{args.src_lang}",
    dataset_tgt=f"{args.data_root}/{args.eval_datasets[0]}.{args.tgt_lang}",
    tokens_in_batch=args.eval_batch_size)
encoder = nemo_nlp.TransformerEncoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
    local_rank=args.local_rank,
    optimization_level=opt_level,
    log_dir=args.work_dir,
    tensorboard_dir=tb_name,
)

# a single YouTokenToMe tokenizer is shared by the source and target languages
tokenizer = nemo_nlp.YouTokenToMeTokenizer(
    model_path=f"{args.data_root}/{args.tokenizer_model}")
vocab_size = tokenizer.vocab_size
max_sequence_length = 512

# the training data layer cleans sentence pairs (clean=True);
# the evaluation layers keep all pairs (clean=False)
train_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=args.data_root + args.train_dataset + "." + args.src_lang,
    dataset_tgt=args.data_root + args.train_dataset + "." + args.tgt_lang,
    tokens_in_batch=args.batch_size,
    clean=True)

eval_data_layers = {}
for key in args.eval_datasets:
    eval_data_layers[key] = nemo_nlp.TranslationDataLayer(
        factory=neural_factory,
        tokenizer_src=tokenizer,
        tokenizer_tgt=tokenizer,
        dataset_src=args.data_root + key + "." + args.src_lang,
        dataset_tgt=args.data_root + key + "." + args.tgt_lang,
        tokens_in_batch=args.eval_batch_size,
        clean=False)
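# Sketch (not from the original script) of consuming the eval data layers built
# above; the unpacking order mirrors the TranslationDataLayer outputs used in
# the pipeline functions earlier in this section.
for split, layer in eval_data_layers.items():
    src, src_mask, tgt, tgt_mask, labels, sent_ids = layer()
    # ...connect these tensors to the encoder/decoder graph to evaluate `split`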