Example #1
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
data_desc = BERTPretrainingDataDesc(args.dataset_name, args.data_dir,
                                    args.vocab_size, args.sample_size,
                                    special_tokens, 'train.txt')

if args.tokenizer == "sentence-piece":
    nf.logger.info("Using SentencePieceTokenizer.")
    tokenizer = nemo_nlp.SentencePieceTokenizer(
        model_path=data_desc.tokenizer_model)
    tokenizer.add_special_tokens(special_tokens)
elif args.tokenizer == "nemo-bert":
    nf.logger.info("Using NemoBertTokenizer.")
    vocab_file = os.path.join(args.data_dir, 'vocab.txt')
    # To train on a Chinese dataset, use NemoBertTokenizer with the
    # dataset's vocab.txt.
    tokenizer = nemo_nlp.NemoBertTokenizer(vocab_file=vocab_file)
else:
    raise ValueError("Unsupported tokenizer: use 'sentence-piece' or "
                     "'nemo-bert', or add your own tokenizer here.")

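# Optional smoke test of the chosen tokenizer: text_to_ids / ids_to_text
# are assumed here from NeMo's TokenizerSpec interface.
sample_ids = tokenizer.text_to_ids("a quick tokenizer check")
nf.logger.info("round trip: %s", tokenizer.ids_to_text(sample_ids))
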
bert_model = nemo_nlp.huggingface.BERT(vocab_size=tokenizer.vocab_size,
                                       num_layers=args.num_layers,
                                       d_model=args.d_model,
                                       num_heads=args.num_heads,
                                       d_inner=args.d_inner,
                                       max_seq_length=args.max_seq_length,
                                       hidden_act="gelu")
""" Create the modules needed for the whole pretraining pipeline:
data layers, BERT encoder, and the MLM and NSP loss functions.
"""
mlm_classifier = nemo_nlp.TokenClassifier(args.d_model,
                                          # the snippet ends mid-call; the
                                          # remaining arguments below are
                                          # assumed (standard MLM head setup)
                                          num_classes=tokenizer.vocab_size,
                                          num_layers=1,
                                          log_softmax=True)
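
The docstring above also names MLM and NSP loss functions that the truncated snippet never reaches. Below is a minimal sketch of how the remaining heads and losses would be wired in this NeMo 0.x API; the module names MaskedLanguageModelingLossNM, SequenceClassifier, CrossEntropyLoss, and LossAggregatorNM are assumptions from that release line, not part of the snippet:

# Assumed NeMo 0.x module names; verify against your installed version.
mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM()

# NSP is a binary (is-next / not-next) classification head over the
# pooled [CLS] representation of each sentence pair.
nsp_classifier = nemo_nlp.SequenceClassifier(args.d_model,
                                             num_classes=2,
                                             num_layers=2,
                                             log_softmax=True)
nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss()

# Aggregate both objectives into the single loss used for training.
bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)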
Example #2
    # The snippet opens mid-call: this is the tail of a NeuralModuleFactory
    # setup, so the assignment and first argument below are reconstructed
    # and assumed from the NeMo 0.x API.
    nf = nemo.core.NeuralModuleFactory(local_rank=args.local_rank,
                                       optimization_level=args.amp_opt_level,
                                       log_dir=args.work_dir,
                                       create_tb_writer=True,
                                       files_to_copy=[__file__],
                                       add_time_to_log_dir=True)

    if args.tokenizer == "sentencepiece":
        try:
            tokenizer = nemo_nlp.SentencePieceTokenizer(
                model_path=args.tokenizer_model)
        except Exception:
            raise ValueError("Using --tokenizer=sentencepiece requires a "
                             "valid --tokenizer_model")
        tokenizer.add_special_tokens(["[CLS]", "[SEP]"])
    elif args.tokenizer == "nemobert":
        tokenizer = nemo_nlp.NemoBertTokenizer(args.pretrained_bert_model)
    else:
        raise ValueError(f"received unexpected tokenizer '{args.tokenizer}'")

    if args.bert_config is not None:
        with open(args.bert_config) as json_file:
            config = json.load(json_file)
        model = nemo_nlp.huggingface.BERT(**config)
    else:
        # Use this branch for a standard pretrained BERT model.
        # To see the list of pretrained models, call:
        #     nemo_nlp.huggingface.BERT.list_pretrained_models()
        model = nemo_nlp.huggingface.BERT(
            pretrained_model_name=args.pretrained_bert_model)
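
The comment in the else branch points at nemo_nlp.huggingface.BERT.list_pretrained_models(); a quick way to inspect the available checkpoints before choosing --pretrained_bert_model is simply to print each entry (the exact fields of the returned records vary across NeMo versions):

for pretrained in nemo_nlp.huggingface.BERT.list_pretrained_models():
    print(pretrained)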