    create_tb_writer=True, files_to_copy=[__file__])

# tie the weights of the embedding and log_softmax layers if the same
# tokenizer is used for the source and the target
tie_weight = False

"""Define tokenizer
When the src and tgt languages are very different, it's better to use
separate tokenizers.
"""
if args.src_lang == 'en' and args.tgt_lang == 'de':
    """
    We use a YouTokenToMe tokenizer trained on joint English & German data
    for both source and target languages.
    """
    src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(
        model_path=f"{args.data_dir}/{args.src_tokenizer_model}")
    src_vocab_size = src_tokenizer.vocab_size
    if args.src_tokenizer_model == args.tgt_tokenizer_model:
        tgt_tokenizer = src_tokenizer
        # source and target use the same tokenizer, so set tie_weight to True
        tie_weight = True
    else:
        tgt_tokenizer = nemo_nlp.YouTokenToMeTokenizer(
            model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}")
        # source and target use different tokenizers, so set tie_weight to False
        tie_weight = False
    tgt_vocab_size = tgt_tokenizer.vocab_size
elif args.src_lang == 'en' and args.tgt_lang == 'zh':
    """
    We use YouTokenToMeTokenizer for src since the src contains English words
    and CharTokenizer for tgt since the tgt contains Chinese characters.
    """
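    # Illustrative sketch (not part of the original snippet): a plausible body
    # for this en->zh branch, following the docstring above. The CharTokenizer
    # name and its vocab_path argument are assumptions about the nemo_nlp API
    # and may differ in practice.
    src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(
        model_path=f"{args.data_dir}/{args.src_tokenizer_model}")
    src_vocab_size = src_tokenizer.vocab_size
    # character-level tokenizer for the Chinese target (assumed constructor)
    tgt_tokenizer = nemo_nlp.CharTokenizer(
        vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}")
    tgt_vocab_size = tgt_tokenizer.vocab_size
    # different tokenizers, so the embedding and log_softmax weights stay untied
    tie_weight = False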
name = f"transformer-nmt_{args.src_lang}_{args.tgt_lang}-lr_{args.lr}-" \ f"optim_{args.optimizer}-warmup_{args.warmup_steps}-bs_{args.batch_size}" tb_writer = None # SummaryWriter(name) # instantiate Neural Factory with supported backend device = nemo.core.DeviceType.AllGpu if args.local_rank is not None \ else nemo.core.DeviceType.GPU neural_factory = nemo.core.NeuralModuleFactory( backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=nemo.core.Optimization.mxprO2, placement=device) # define tokenizer, in this example we use YouTokenToMe tokenizer trained # on joint English & German data for both source and target languages tokenizer = nemo_nlp.YouTokenToMeTokenizer( model_path=f"{args.data_root}/{args.tokenizer_model}") vocab_size = tokenizer.vocab_size # instantiate necessary modules for the whole translation pipeline, namely # data layers, encoder, decoder, output log_softmax, beam_search_translator # and loss function train_data_layer = nemo_nlp.TranslationDataLayer( factory=neural_factory, tokenizer_src=tokenizer, tokenizer_tgt=tokenizer, dataset_src=f"{args.data_root}/{args.train_dataset}.{args.src_lang}", dataset_tgt=f"{args.data_root}/{args.train_dataset}.{args.tgt_lang}", tokens_in_batch=args.batch_size, clean=True) eval_data_layer = nemo_nlp.TranslationDataLayer( factory=neural_factory,