if __name__ == '__main__':
    random_seed = random.randint(0, 1000)
    nlp.utils.mkdir(args.ckpt_dir)
    ctx = mx.gpu(local_rank)

    # Resolve the vocabulary: a sentencepiece model, if given, takes
    # precedence over a pre-defined dataset vocabulary.
    dataset_name, vocab = args.dataset_name, None
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s',
                     args.sentencepiece)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece')
            dataset_name = None
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)

    # Build the BERT model together with the next-sentence-prediction and
    # masked-language-model losses.
    model, nsp_loss, mlm_loss, vocab = get_model_loss([ctx], args.model,
                                                      args.pretrained,
                                                      dataset_name, vocab,
                                                      args.dtype,
                                                      ckpt_dir=args.ckpt_dir,
                                                      start_step=args.start_step)
    logging.debug('Model created')
    data_eval = args.data_eval

    if args.raw:
        # Raw text input: pick the tokenizer that matches the vocabulary.
        if args.sentencepiece:
            tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                                 num_best=args.sp_nbest,
                                                 alpha=args.sp_alpha,
                                                 lower=not args.cased)
        else:
            tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

        # Cache the featurized evaluation data so later runs can skip the
        # preprocessing step.
        cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
        cache_file = os.path.join(cache_dir, 'part-000.npz')
        nlp.utils.mkdir(cache_dir)
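# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): how the vocabulary
# and tokenizer constructed above are typically used downstream. The path
# 'bpe.model' and the sample sentence are placeholders, and the exact
# wordpiece split depends on the trained sentencepiece model, so no specific
# output is guaranteed.
#
#   vocab = nlp.vocab.BERTVocab.from_sentencepiece('bpe.model')
#   tokenizer = nlp.data.BERTSPTokenizer('bpe.model', vocab, lower=True)
#   tokens = tokenizer('gluonnlp makes bert pretraining easy')
#   token_ids = vocab[tokens]  # map wordpieces to integer ids for the model
# ---------------------------------------------------------------------------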