create_tb_writer=True,
                                   files_to_copy=[__file__])

# Tie the weights of the embedding and log_softmax layers if the source and
# the target use the same tokenizer (a minimal weight-tying sketch follows
# this example).
tie_weight = False
"""Define tokenizer
When the src and tgt languages are very different, it's better to use separate
tokenizers.
"""
if args.src_lang == 'en' and args.tgt_lang == 'de':
    """
    We use YouTokenToMe tokenizer trained on joint
    English & German data for both source and target languages.
    """
    src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(
        model_path=f"{args.data_dir}/{args.src_tokenizer_model}")
    src_vocab_size = src_tokenizer.vocab_size
    if args.src_tokenizer_model == args.tgt_tokenizer_model:
        tgt_tokenizer = src_tokenizer
        # source and target use the same tokenizer, set tie_weight to True
        tie_weight = True
    else:
        tgt_tokenizer = nemo_nlp.YouTokenToMeTokenizer(
            model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}")
        # source and target use different tokenizers, set tie_weight to False
        tie_weight = False
    tgt_vocab_size = tgt_tokenizer.vocab_size
elif args.src_lang == 'en' and args.tgt_lang == 'zh':
    """
    We use YouTokenToMeTokenizer for src since the src contains English words
    and CharTokenizer for tgt since the tgt contains Chinese characters.
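The tie_weight flag set above only takes effect once the embedding and output-projection modules exist later in the script. As a minimal, self-contained sketch of what weight tying means (plain PyTorch, not the NeMo modules used in this example; vocab_size and d_model values are illustrative):

import torch.nn as nn

# One shared vocab_size x d_model table: the output projection reuses the
# embedding matrix, so both point at the same Parameter object.
vocab_size, d_model = 32000, 512
embedding = nn.Embedding(vocab_size, d_model)
output_projection = nn.Linear(d_model, vocab_size, bias=False)
output_projection.weight = embedding.weight  # tied weights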
Example #2
name = f"transformer-nmt_{args.src_lang}_{args.tgt_lang}-lr_{args.lr}-" \
    f"optim_{args.optimizer}-warmup_{args.warmup_steps}-bs_{args.batch_size}"
tb_writer = None  # SummaryWriter(name)
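# The SummaryWriter above is left disabled in this example; to actually log to
# TensorBoard it can be re-enabled (assumption: using the writer bundled with
# PyTorch):
# from torch.utils.tensorboard import SummaryWriter
# tb_writer = SummaryWriter(name)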

# instantiate Neural Factory with supported backend
device = nemo.core.DeviceType.AllGpu if args.local_rank is not None \
    else nemo.core.DeviceType.GPU
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=args.local_rank,
    optimization_level=nemo.core.Optimization.mxprO2,
    placement=device)
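# Note (assumption about how this script is launched): local_rank is normally
# injected by the PyTorch distributed launcher, e.g.
#   python -m torch.distributed.launch --nproc_per_node=<num_gpus> <this_script>.py
# When local_rank is None, the factory falls back to the single-GPU placement
# chosen above.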

# Define the tokenizer. In this example we use a YouTokenToMe tokenizer trained
# on joint English & German data for both the source and target languages.
tokenizer = nemo_nlp.YouTokenToMeTokenizer(
    model_path=f"{args.data_root}/{args.tokenizer_model}")
vocab_size = tokenizer.vocab_size
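# Optional sanity check (assumption: the wrapper follows NeMo's TokenizerSpec
# interface, which exposes text_to_ids()/ids_to_text()); a quick round trip
# confirms the BPE model file was loaded correctly.
sample_sentence = "machine translation sanity check"
print(tokenizer.ids_to_text(tokenizer.text_to_ids(sample_sentence)))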

# instantiate necessary modules for the whole translation pipeline, namely
# data layers, encoder, decoder, output log_softmax, beam_search_translator
# and loss function
train_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=f"{args.data_root}/{args.train_dataset}.{args.src_lang}",
    dataset_tgt=f"{args.data_root}/{args.train_dataset}.{args.tgt_lang}",
    tokens_in_batch=args.batch_size,
    clean=True)
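# Note (assumption based on the parameter names): tokens_in_batch measures the
# batch size in tokens rather than sentences, so batches are packed up to this
# token budget, and clean=True filters out overly long or badly mismatched
# sentence pairs before training.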
eval_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,