Example no. 1
def create_pipeline(dataset, tokens_in_batch, clean=False, training=True):
    # Resolve the parallel source/target files for this split inside args.data_dir.
    dataset_src = os.path.join(args.data_dir, dataset + "." + args.src_lang)
    dataset_tgt = os.path.join(args.data_dir, dataset + "." + args.tgt_lang)
    data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=tokenizer,
                                               tokenizer_tgt=tokenizer,
                                               dataset_src=dataset_src,
                                               dataset_tgt=dataset_tgt,
                                               tokens_in_batch=tokens_in_batch,
                                               clean=clean)
    src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer()
    # Build all-zero token type ids, since the BERT-style encoder expects them.
    input_type_ids = zeros_transform(input_type_ids=src)
    src_hiddens = encoder(input_ids=src,
                          token_type_ids=input_type_ids,
                          attention_mask=src_mask)
    tgt_hiddens = decoder(input_ids_tgt=tgt,
                          hidden_states_src=src_hiddens,
                          input_mask_src=src_mask,
                          input_mask_tgt=tgt_mask)
    log_softmax = t_log_softmax(hidden_states=tgt_hiddens)
    loss = loss_fn(logits=log_softmax, target_ids=labels)
    # The beam-search translator is only wired into evaluation pipelines.
    beam_results = None
    if not training:
        beam_results = beam_search(hidden_states_src=src_hiddens,
                                   input_mask_src=src_mask)
    return loss, [tgt, loss, beam_results, sent_ids]
Example no. 2
def create_pipeline(dataset_src,
                    dataset_tgt,
                    tokens_in_batch,
                    clean=False,
                    training=True):
    data_layer = nemo_nlp.TranslationDataLayer(tokenizer_src=src_tokenizer,
                                               tokenizer_tgt=tgt_tokenizer,
                                               dataset_src=dataset_src,
                                               dataset_tgt=dataset_tgt,
                                               tokens_in_batch=tokens_in_batch,
                                               clean=clean)
    src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer()
    src_hiddens = encoder(input_ids=src, input_mask_src=src_mask)
    tgt_hiddens = decoder(input_ids_tgt=tgt,
                          hidden_states_src=src_hiddens,
                          input_mask_src=src_mask,
                          input_mask_tgt=tgt_mask)
    logits = log_softmax(hidden_states=tgt_hiddens)
    loss = loss_fn(logits=logits, target_ids=labels)
    beam_results = None
    if not training:
        beam_results = beam_search(hidden_states_src=src_hiddens,
                                   input_mask_src=src_mask)
    return loss, [tgt, loss, beam_results, sent_ids]
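Both variants of create_pipeline return the loss tensor to optimize plus the tensors an evaluation callback would need (targets, loss, beam-search output and sentence ids). A minimal usage sketch for the second variant, assuming hypothetical "train"/"valid" file names that follow the same <split>.<lang> pattern as the other examples:

# Hypothetical dataset paths; only create_pipeline's signature is taken from above.
train_loss, _ = create_pipeline(
    dataset_src=os.path.join(args.data_dir, "train." + args.src_lang),
    dataset_tgt=os.path.join(args.data_dir, "train." + args.tgt_lang),
    tokens_in_batch=args.batch_size,
    clean=True)
eval_loss, eval_tensors = create_pipeline(
    dataset_src=os.path.join(args.data_dir, "valid." + args.src_lang),
    dataset_tgt=os.path.join(args.data_dir, "valid." + args.tgt_lang),
    tokens_in_batch=args.eval_batch_size,
    training=False)
# train_loss drives the training loop; eval_tensors feed the evaluation callbacks.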
Example no. 3
# define the parameters for the first sub-layer in the Transformer decoder block
dec_first_sublayer_params = {
    "first_sub_layer": "self_attention",
    "attn_score_dropout": args.attn_score_dropout,
    "attn_layer_dropout": args.attn_layer_dropout,
}
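# (A dict like this is typically unpacked into the decoder-block constructor,
#  e.g. SomeTransformerDecoder(..., **dec_first_sublayer_params); the constructor
#  name is only a placeholder here, not taken from the NeMo API.)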

tokenizer = NemoBertTokenizer(pretrained_model=args.pretrained_model)
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)
tokens_to_add = vocab_size - tokenizer.vocab_size
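# For example, a 30,522-token vocabulary (BERT-base uncased) would be padded up
# to 30,528, the next multiple of 8, so tokens_to_add would be 6; keeping the
# embedding/softmax dimensions divisible by 8 is friendlier to Tensor Core
# kernels when training with mixed precision.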

train_data_layer = nemo_nlp.TranslationDataLayer(
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=args.data_dir + "test." + args.src_lang,
    dataset_tgt=args.data_dir + "test." + args.tgt_lang,
    tokens_in_batch=args.batch_size,
    clean=True)

eval_data_layers = {}
dataset_keys = ["dev_clean", "dev_other", "test_clean", "test_other"]

for key in dataset_keys:
    eval_data_layers[key] = nemo_nlp.TranslationDataLayer(
        tokenizer_src=tokenizer,
        tokenizer_tgt=tokenizer,
        dataset_src=args.data_dir + key + "." + args.src_lang,
        dataset_tgt=args.data_dir + key + "." + args.tgt_lang,
        tokens_in_batch=args.eval_batch_size,
        clean=False)
Example no. 4
    optimization_level=nemo.core.Optimization.mxprO2,
    placement=device)

# define the tokenizer; in this example we use a YouTokenToMe tokenizer trained
# on joint English & German data for both the source and target languages
tokenizer = nemo_nlp.YouTokenToMeTokenizer(
    model_path=f"{args.data_root}/{args.tokenizer_model}")
vocab_size = tokenizer.vocab_size

# instantiate the modules needed for the whole translation pipeline: the
# data layers, encoder, decoder, output log_softmax, beam_search_translator,
# and the loss function
train_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=f"{args.data_root}/{args.train_dataset}.{args.src_lang}",
    dataset_tgt=f"{args.data_root}/{args.train_dataset}.{args.tgt_lang}",
    tokens_in_batch=args.batch_size,
    clean=True)
eval_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=f"{args.data_root}/{args.eval_datasets[0]}.{args.src_lang}",
    dataset_tgt=f"{args.data_root}/{args.eval_datasets[0]}.{args.tgt_lang}",
    tokens_in_batch=args.eval_batch_size)
encoder = nemo_nlp.TransformerEncoderNM(
    factory=neural_factory,
    d_embedding=args.d_embedding,
    d_model=args.d_model,
    d_inner=args.d_inner,
Example no. 5
    local_rank=args.local_rank,
    optimization_level=opt_level,
    log_dir=args.work_dir,
    tensorboard_dir=tb_name,
)

tokenizer = nemo_nlp.YouTokenToMeTokenizer(
    model_path=f"{args.data_root}/{args.tokenizer_model}")
vocab_size = tokenizer.vocab_size

max_sequence_length = 512

train_data_layer = nemo_nlp.TranslationDataLayer(
    factory=neural_factory,
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src=args.data_root + args.train_dataset + "." + args.src_lang,
    dataset_tgt=args.data_root + args.train_dataset + "." + args.tgt_lang,
    tokens_in_batch=args.batch_size,
    clean=True)

eval_data_layers = {}

for key in args.eval_datasets:
    eval_data_layers[key] = nemo_nlp.TranslationDataLayer(
        factory=neural_factory,
        tokenizer_src=tokenizer,
        tokenizer_tgt=tokenizer,
        dataset_src=args.data_root + key + "." + args.src_lang,
        dataset_tgt=args.data_root + key + "." + args.tgt_lang,
        tokens_in_batch=args.eval_batch_size,
        clean=False)
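Each per-split data layer yields the same tensor tuple as in the create_pipeline examples above, so wiring the splits into the rest of the graph is mechanical. A sketch, assuming the encoder, decoder, log_softmax, loss_fn and beam_search modules from the earlier examples have already been instantiated:

eval_tensors = {}
for key, layer in eval_data_layers.items():
    src, src_mask, tgt, tgt_mask, labels, sent_ids = layer()
    src_hiddens = encoder(input_ids=src, input_mask_src=src_mask)
    tgt_hiddens = decoder(input_ids_tgt=tgt,
                          hidden_states_src=src_hiddens,
                          input_mask_src=src_mask,
                          input_mask_tgt=tgt_mask)
    log_probs = log_softmax(hidden_states=tgt_hiddens)
    loss = loss_fn(logits=log_probs, target_ids=labels)
    beam_results = beam_search(hidden_states_src=src_hiddens,
                               input_mask_src=src_mask)
    eval_tensors[key] = [tgt, loss, beam_results, sent_ids]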