Code example #1
File: bert_pretraining.py  Project: panpepson/NeMo
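The excerpt starts after the file's import section. A minimal, assumed set of imports it relies on (module paths follow the old NeMo 0.x API and may differ in the actual project) would be:

import os

import nemo
import nemo_nlp
# `args` comes from an argparse parser defined earlier in the file;
# BERTPretrainingDataDesc is also provided by nemo_nlp (the exact module
# path depends on the NeMo version).

# Create the NeuralModuleFactory, which owns the backend, device placement,
# mixed-precision level, and the TensorBoard/log directory setup.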
nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch,
                                   local_rank=args.local_rank,
                                   optimization_level=args.amp_opt_level,
                                   log_dir=args.work_dir,
                                   create_tb_writer=True,
                                   files_to_copy=[__file__],
                                   add_time_to_log_dir=True)

special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
data_desc = BERTPretrainingDataDesc(args.dataset_name, args.data_dir,
                                    args.vocab_size, args.sample_size,
                                    special_tokens, 'train.txt')

if args.tokenizer == "sentence-piece":
    nf.logger.info("Using SentencePieceTokenizer.")
    tokenizer = nemo_nlp.SentencePieceTokenizer(
        model_path=data_desc.tokenizer_model)
    tokenizer.add_special_tokens(special_tokens)
elif args.tokenizer == "nemo-bert":
    nf.logger.info("Using NemoBertTokenizer.")
    vocab_file = os.path.join(args.data_dir, 'vocab.txt')
    # To train on a Chinese dataset, use NemoBertTokenizer
    tokenizer = nemo_nlp.NemoBertTokenizer(vocab_file=vocab_file)
else:
    raise ValueError("Unsupported tokenizer: choose 'sentence-piece' or "
                     "'nemo-bert', or add your own tokenizer here.")

bert_model = nemo_nlp.huggingface.BERT(vocab_size=tokenizer.vocab_size,
                                       num_layers=args.num_layers,
                                       d_model=args.d_model,
                                       num_heads=args.num_heads,
                                       d_inner=args.d_inner,
Code example #2
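As in the first example, the snippet begins after the imports; it needs at least the following (assumed, not shown in the excerpt):

import math

import nemo
import nemo_nlp
# `args` (argparse options) and `device` (the placement passed to the
# factory below) are defined earlier in the file.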
# Map the --fp16 flag (0-3) to the corresponding mixed-precision
# optimization level; 0 keeps pure FP32 training.
if args.fp16 == 1:
    optimization_level = nemo.core.Optimization.mxprO1
elif args.fp16 == 2:
    optimization_level = nemo.core.Optimization.mxprO2
elif args.fp16 == 3:
    optimization_level = nemo.core.Optimization.mxprO3
else:
    optimization_level = nemo.core.Optimization.mxprO0

neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=args.local_rank,
    optimization_level=optimization_level,
    placement=device)

tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=args.tokenizer_model)
tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])
# Round the vocabulary size up to a multiple of 8; sizes divisible by 8
# make better use of Tensor Cores when training in mixed precision.
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)

bert_model = nemo_nlp.huggingface.BERT(vocab_size=vocab_size,
                                       num_layers=args.num_layers,
                                       d_model=args.d_model,
                                       num_heads=args.num_heads,
                                       d_inner=args.d_inner,
                                       max_seq_length=args.max_sequence_length,
                                       hidden_act="gelu",
                                       factory=neural_factory)

# instantiate the remaining modules of the BERT pretraining pipeline, namely
# the data layers, BERT encoder, and the MLM and NSP loss functions
mlm_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(vocab_size=vocab_size,