def setup_logging(args, local_rank):
    """Setup logging configuration as well as random seed"""
    logging_config(args.output_dir,
                   name='finetune_squad{}'.format(args.version),  # avoid race
                   overwrite_handler=True,
                   console=(local_rank == 0))
    logging.info(args)
    set_seed(args.seed)
    logging.debug('Random seed set to {}'.format(args.seed))
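# --- Illustrative sketch (not the actual gluonnlp implementation) ---
# A minimal logging_config-style helper using only the standard library: it
# attaches a FileHandler writing to <folder>/<name>.log, optionally mirrors
# records to the console, and can drop previously attached handlers so that
# repeated calls do not double-log.  Argument names mirror the calls above,
# but the body is an assumption made for clarity.
import logging
import os


def minimal_logging_config(folder, name='run', level=logging.INFO,
                           console=True, overwrite_handler=False, logger=None):
    logger = logger if logger is not None else logging.getLogger()
    logger.setLevel(level)
    if overwrite_handler:
        # Remove handlers added by earlier calls so only the new file is used.
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
    os.makedirs(folder, exist_ok=True)
    logger.addHandler(logging.FileHandler(os.path.join(folder, name + '.log')))
    if console:
        logger.addHandler(logging.StreamHandler())
    return logger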
def test_logging_config():
    logger = logging.getLogger(__name__)
    with tempfile.TemporaryDirectory() as root:
        logging_config(folder=root, logger=logger, name='test')
        file_names = os.listdir(root)
        assert file_names[0] == 'test.log'
        file_size = Path(os.path.join(root, 'test.log')).stat().st_size
        assert file_size == 0
        logger.info('123')
        for handler in logger.handlers:
            handler.flush()
        file_size_test1 = Path(os.path.join(root, 'test.log')).stat().st_size
        assert file_size_test1 > 0
        logging_config(folder=root, logger=logger, name='foo',
                       overwrite_handler=False)
        logger.info('123')
        for handler in logger.handlers:
            handler.flush()
        file_size_test2 = Path(os.path.join(root, 'test.log')).stat().st_size
        file_size_foo1 = Path(os.path.join(root, 'foo.log')).stat().st_size
        assert file_size_test2 > file_size_test1
        assert file_size_foo1 > 0
        # After overwrite, the old handler will be removed
        logging_config(folder=root, logger=logger, name='zoo',
                       overwrite_handler=True)
        logger.info('12345')
        for handler in logger.handlers:
            handler.flush()
        file_size_zoo1 = Path(os.path.join(root, 'zoo.log')).stat().st_size
        file_size_test3 = Path(os.path.join(root, 'test.log')).stat().st_size
        file_size_foo2 = Path(os.path.join(root, 'foo.log')).stat().st_size
        assert file_size_test3 == file_size_test2
        assert file_size_foo2 == file_size_foo1
        assert file_size_zoo1 > 0
def train(args): _, num_parts, rank, local_rank, _, ctx_l = init_comm( args.comm_backend, args.gpus) if args.comm_backend == 'horovod': logging_config( args.save_dir, name=f'train_transformer_rank{rank}_local{local_rank}_{num_parts}', console=(rank == 0)) logging.info(args) else: logging_config(args.save_dir, name='train_transformer', console=True) logging.info(args) use_amp = args.fp16 if use_amp: from mxnet import amp src_tokenizer = create_tokenizer(args.src_tokenizer, args.src_subword_model_path, args.src_vocab_path) tgt_tokenizer = create_tokenizer(args.tgt_tokenizer, args.tgt_subword_model_path, args.tgt_vocab_path) base_tgt_tokenizer = MosesTokenizer(args.tgt_lang) src_vocab = src_tokenizer.vocab tgt_vocab = tgt_tokenizer.vocab train_src_data, train_tgt_data = load_dataset_with_cache( args.train_src_corpus, args.train_tgt_corpus, src_tokenizer, tgt_tokenizer, args.overwrite_cache, local_rank, max_src_length=args.max_src_length, max_tgt_length=args.max_tgt_length, pretokenized=not args.tokenize) dev_src_data, dev_tgt_data = load_dataset_with_cache( args.dev_src_corpus, args.dev_tgt_corpus, src_tokenizer, tgt_tokenizer, args.overwrite_cache, local_rank, pretokenized=not args.tokenize) tgt_detok_sentences = [] tgt_raw_sentences = [] with open(args.dev_tgt_corpus, 'r') as in_f: for line in in_f: tgt_detok_sentences.append( base_tgt_tokenizer.decode( tgt_tokenizer.decode(line.split()).split())) with open(args.dev_tgt_raw_corpus, 'r') as in_f: for line in in_f: tgt_raw_sentences.append(line.strip()) data_train = gluon.data.SimpleDataset([ (src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i) for i, (src_tokens, tgt_tokens) in enumerate(zip(train_src_data, train_tgt_data)) ]) val_samples = [ (src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i) for i, (src_tokens, tgt_tokens) in enumerate(zip(dev_src_data, dev_tgt_data)) ] if args.comm_backend == 'horovod': slice_begin = rank * (len(val_samples) // num_parts) slice_end = min((rank + 1) * (len(val_samples) // num_parts), len(val_samples)) data_val = gluon.data.SimpleDataset(val_samples[slice_begin:slice_end]) else: data_val = gluon.data.SimpleDataset(val_samples) # Construct the model + loss function if args.cfg.endswith('.yml'): cfg = TransformerModel.get_cfg().clone_merge(args.cfg) else: cfg = TransformerModel.get_cfg(args.cfg) cfg.defrost() cfg.MODEL.src_vocab_size = len(src_vocab) cfg.MODEL.tgt_vocab_size = len(tgt_vocab) cfg.MODEL.layout = 'TN' cfg.freeze() model = TransformerModel.from_cfg(cfg) model.initialize(mx.init.Xavier(magnitude=args.magnitude), ctx=ctx_l) model.hybridize() for v in model.collect_params().values(): if v.grad_req != 'null': v.grad_req = 'add' # Do not apply weight decay to all the LayerNorm and bias for _, v in model.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 param_dict = deduplicate_param_dict(model.collect_params()) inference_model = TransformerInference(model=model) inference_model.hybridize() if local_rank == 0: logging.info(model) with open(os.path.join(args.save_dir, 'config.yml'), 'w') as cfg_f: cfg_f.write(cfg.dump()) label_smooth_loss = LabelSmoothCrossEntropyLoss( num_labels=len(tgt_vocab), alpha=args.label_smooth_alpha, from_logits=False) label_smooth_loss.hybridize() # Construct the beam search sampler scorer = BeamSearchScorer(alpha=args.lp_alpha, K=args.lp_k, from_logits=False) beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size, decoder=inference_model, vocab_size=len(tgt_vocab), eos_id=tgt_vocab.eos_id, scorer=scorer, stochastic=False, 
max_length_a=args.max_length_a, max_length_b=args.max_length_b) logging.info(beam_search_sampler) if args.comm_backend == 'horovod': hvd.broadcast_parameters(param_dict, root_rank=0) # Construct the trainer if args.lr is None: base_lr = 2.0 / math.sqrt(args.num_units) / math.sqrt( args.warmup_steps) else: base_lr = args.lr lr_scheduler = InverseSquareRootScheduler( warmup_steps=args.warmup_steps, base_lr=base_lr, warmup_init_lr=args.warmup_init_lr) optimizer_params = { 'learning_rate': args.lr, 'beta1': 0.9, 'beta2': 0.997, 'epsilon': 1e-9, 'lr_scheduler': lr_scheduler, 'wd': args.wd } user_provided_ptimizer_params = json.loads(args.optimizer_params) optimizer_params.update(user_provided_ptimizer_params) if args.fp16: optimizer_params.update({'multi_precision': True}) if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) else: trainer = gluon.Trainer(param_dict, args.optimizer, optimizer_params, update_on_kvstore=False) # Load Data if args.sampler == 'BoundedBudgetSampler': train_batch_sampler = BoundedBudgetSampler( lengths=[(ele[2], ele[3]) for ele in data_train], max_num_tokens=args.max_num_tokens, max_num_sentences=args.max_num_sentences, shuffle=True, seed=args.seed) elif args.sampler == 'FixedBucketSampler': if args.comm_backend == 'horovod': raise NotImplementedError( 'FixedBucketSampler does not support horovod at present') if args.bucket_scheme == 'constant': bucket_scheme = ConstWidthBucket() elif args.bucket_scheme == 'linear': bucket_scheme = LinearWidthBucket() elif args.bucket_scheme == 'exp': bucket_scheme = ExpWidthBucket(bucket_len_step=1.2) else: raise NotImplementedError # TODO(sxjscience) Support auto-bucket-size tuning train_batch_sampler = FixedBucketSampler(lengths=[ (ele[2], ele[3]) for ele in data_train ], batch_size=args.batch_size, num_buckets=args.num_buckets, ratio=args.bucket_ratio, shuffle=True, use_average_length=True, bucket_scheme=bucket_scheme, seed=args.seed) else: raise NotImplementedError num_updates_per_epoch = int( math.ceil( len(train_batch_sampler) / (num_parts * len(ctx_l) * args.num_accumulated))) # Convert the batch sampler to multiple shards if num_parts > 1: train_batch_sampler = ShardedIterator(train_batch_sampler, num_parts=num_parts, part_index=rank, even_size=True, seed=args.seed + 1000 * rank) logging.info(train_batch_sampler) batchify_fn = bf.Tuple(bf.Pad(), bf.Pad(), bf.Stack(), bf.Stack(), bf.Stack()) train_data_loader = gluon.data.DataLoader( data_train, batch_sampler=train_batch_sampler, batchify_fn=batchify_fn, num_workers=0) val_data_loader = gluon.data.DataLoader(data_val, batch_size=args.val_batch_size, batchify_fn=batchify_fn, num_workers=0, shuffle=False) params = [p for p in param_dict.values() if p.grad_req != 'null'] model_averager = AverageSGDTracker(param_dict) log_start_time = time.time() num_params, num_fixed_params = None, None # TODO(sxjscience) Add a log metric class log_avg_loss_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l] # Maintain the denominator of the loss. 
log_avg_loss_denom_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l] log_wc_l = [mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l] log_tgt_wc_l = [mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l] log_avg_grad_norm = 0 log_iter_num = 0 if local_rank == 0: writer = SummaryWriter( logdir=os.path.join(args.save_dir, 'tensorboard')) if use_amp: amp.init_trainer(trainer) train_multi_data_loader = grouper(repeat(train_data_loader), len(ctx_l)) # when args.epochs < 0, the model will keep training if args.epochs < 0: if args.max_update > 0: total_train_iters = args.max_update if args.num_averages > 0: assert args.num_averages <= total_train_iters // args.save_iterval_update avg_start_iter = ( total_train_iters // args.save_iterval_update - args.num_averages) * args.save_iterval_update else: avg_start_iter = -1 else: total_train_iters = np.inf avg_start_iter = -1 else: total_train_iters = args.epochs * num_updates_per_epoch if args.num_averages > 0: assert args.num_averages <= args.epochs avg_start_iter = (args.epochs - args.num_average) * num_updates_per_epoch else: avg_start_iter = -1 # Here, we are manually setting up the scale to 1.0 because # in horovod, the scale can be the number of workers: # See the code here: https://github.com/horovod/horovod/blob/125115583b7029196e2ec530decd4209459d5479/horovod/mxnet/__init__.py#L141 # Since we will need to use the dynamic scaling in amp, we will manually call amp.unscale(). # A scale that is larger than 1.0 can be problematic in this case. trainer._scale = 1.0 if args.max_num_tokens > 0: const_scale = args.max_num_tokens else: const_scale = 100 train_start_time = time.time() for train_iter in range(total_train_iters): model.zero_grad() loss_denom_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l] for i in range(args.num_accumulated): loss_l = [] sample_data_l = next(train_multi_data_loader) for j, (sample_data, ctx) in enumerate(zip(sample_data_l, ctx_l)): src_token_ids, tgt_token_ids, src_valid_length,\ tgt_valid_length, sample_ids = sample_data src_token_ids = src_token_ids.as_in_ctx(ctx) tgt_token_ids = tgt_token_ids.as_in_ctx(ctx) src_valid_length = src_valid_length.as_in_ctx(ctx) tgt_valid_length = tgt_valid_length.as_in_ctx(ctx) src_wc, tgt_wc, bs = src_valid_length.sum(), \ tgt_valid_length.sum(), src_token_ids.shape[0] log_wc_l[j] += src_wc + tgt_wc log_tgt_wc_l[j] += tgt_wc token_count = (tgt_valid_length - 1).sum() loss_denom_l[j] += token_count / const_scale log_avg_loss_denom_l[j] += token_count / const_scale with mx.autograd.record(): if model.layout == 'NT': tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1], tgt_valid_length - 1) tgt_labels = tgt_token_ids[:, 1:] loss = label_smooth_loss(tgt_pred, tgt_labels) loss = mx.npx.sequence_mask( loss, sequence_length=tgt_valid_length - 1, use_sequence_length=True, axis=1) loss = loss.sum() / const_scale loss_l.append(loss) elif model.layout == 'TN': tgt_pred = model(src_token_ids.T, src_valid_length, tgt_token_ids.T[:-1, :], tgt_valid_length - 1) tgt_labels = tgt_token_ids.T[1:, :] loss = label_smooth_loss(tgt_pred, tgt_labels) loss = mx.npx.sequence_mask( loss, sequence_length=tgt_valid_length - 1, use_sequence_length=True, axis=0) loss = loss.sum() / const_scale loss_l.append(loss) log_avg_loss_l[j] += loss if use_amp: with mx.autograd.record(): with amp.scale_loss(loss_l, trainer) as amp_loss_l: for loss in amp_loss_l: loss.backward() else: with mx.autograd.record(): for loss in loss_l: loss.backward() # Print the total number of parameters if local_rank == 0 
and num_params is None: num_params, num_fixed_params = count_parameters(param_dict) logging.info( 'Total Number of Parameters (not-fixed/fixed): {}/{}'.format( num_params, num_fixed_params)) # All-Reduce the gradient trainer.allreduce_grads() if args.comm_backend == 'horovod': # All-Reduce the loss denominator assert len(loss_denom_l) == 1 loss_denom = hvd.allreduce(loss_denom_l[0], average=False).asnumpy() else: loss_denom = sum([ele.asnumpy() for ele in loss_denom_l]) if use_amp: # We need to first unscale the gradient and then perform allreduce. grad_scale = trainer.amp_loss_scale * loss_denom else: grad_scale = loss_denom if args.max_grad_norm is not None: total_norm, ratio, is_finite\ = clip_grad_global_norm(params, args.max_grad_norm * grad_scale) total_norm = total_norm / grad_scale else: total_norm = grad_global_norm(params) total_norm = total_norm / grad_scale log_avg_grad_norm += total_norm log_iter_num += 1 trainer.update(loss_denom, ignore_stale_grad=True) if avg_start_iter > 0 and train_iter >= avg_start_iter: model_averager.step() if ((train_iter + 1) % args.log_interval == 0 or train_iter + 1 == total_train_iters): if args.comm_backend == 'horovod': # Use allreduce to get the total number of tokens and loss log_wc = hvd.allreduce(log_wc_l[0], average=False).asnumpy() log_tgt_wc = hvd.allreduce(log_tgt_wc_l[0], average=False).asnumpy() log_avg_loss = hvd.allreduce(log_avg_loss_l[0] / log_avg_loss_denom_l[0], average=True) log_avg_loss = log_avg_loss.asnumpy() else: log_wc = sum([ele.asnumpy() for ele in log_wc_l]) log_tgt_wc = sum([ele.asnumpy() for ele in log_tgt_wc_l]) log_avg_loss =\ sum([log_avg_loss_l[i].asnumpy() / log_avg_loss_denom_l[i].asnumpy() for i in range(len(log_avg_loss_l))]) / len(log_avg_loss_l) log_avg_grad_norm = log_avg_grad_norm / log_iter_num log_end_time = time.time() wps = log_wc / (log_end_time - log_start_time) epoch_id = train_iter // num_updates_per_epoch logging.info( '[Epoch {} Iter {}/{}, Overall {}/{}] loss={:.4f}, ppl={:.4f}, ' 'throughput={:.2f}K wps, total wc={:.2f}K, wpb={:.2f}K,' ' LR={}, gnorm={:.4f}, ETA={:.2f}h'.format( epoch_id, train_iter % num_updates_per_epoch + 1, num_updates_per_epoch, train_iter + 1, total_train_iters, log_avg_loss, np.exp(log_avg_loss), wps / 1000, log_wc / 1000, log_tgt_wc / 1000 / log_iter_num, trainer.learning_rate, log_avg_grad_norm, (log_end_time - train_start_time) / (train_iter + 1) * (total_train_iters - train_iter - 1) / 3600)) if local_rank == 0: writer.add_scalar('throughput_wps', wps, train_iter) writer.add_scalar('train_loss', log_avg_loss, train_iter) writer.add_scalar('lr', trainer.learning_rate, train_iter) writer.add_scalar('grad_norm', log_avg_grad_norm, train_iter) # Reinitialize the log variables log_start_time = time.time() log_avg_loss_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l] log_avg_loss_denom_l = [mx.np.array(0.0, ctx=ctx) for ctx in ctx_l] log_avg_grad_norm = 0 log_iter_num = 0 log_wc_l = [ mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l ] log_tgt_wc_l = [ mx.np.array(0, dtype=np.int64, ctx=ctx) for ctx in ctx_l ] if (args.max_update > 0 and (train_iter + 1) % args.save_interval_update == 0) \ or ((train_iter + 1) % num_updates_per_epoch == 0) \ or train_iter + 1 == total_train_iters: epoch_id = (train_iter + 1) // num_updates_per_epoch if local_rank == 0: if args.max_update <= 0: model.save_parameters(os.path.join( args.save_dir, 'epoch{}.params'.format(epoch_id)), deduplicate=True) else: model.save_parameters(os.path.join( args.save_dir, 
'iter{}.params'.format(train_iter + 1)), deduplicate=True) avg_val_loss, ntokens, pred_sentences, pred_lengths, sentence_ids\ = validation(model, val_data_loader, inference_model, beam_search_sampler, tgt_tokenizer, ctx_l) if args.comm_backend == 'horovod': flatten_pred_sentences = np.concatenate(pred_sentences, axis=0) all_val_loss = hvd.allgather( mx.np.array([avg_val_loss * ntokens], dtype=np.float32, ctx=ctx_l[0])) all_ntokens = hvd.allgather( mx.np.array([ntokens], dtype=np.int64, ctx=ctx_l[0])) flatten_pred_sentences = hvd.allgather( mx.np.array(flatten_pred_sentences, dtype=np.int32, ctx=ctx_l[0])) pred_lengths = hvd.allgather( mx.np.array(pred_lengths, dtype=np.int64, ctx=ctx_l[0])) sentence_ids = hvd.allgather( mx.np.array(sentence_ids, dtype=np.int64, ctx=ctx_l[0])) avg_val_loss = all_val_loss.asnumpy().sum( ) / all_ntokens.asnumpy().sum() flatten_pred_sentences = flatten_pred_sentences.asnumpy() pred_lengths = pred_lengths.asnumpy() sentence_ids = sentence_ids.asnumpy() pred_sentences = [None for _ in range(len(sentence_ids))] ptr = 0 assert sentence_ids.min() == 0 and sentence_ids.max( ) == len(sentence_ids) - 1 for sentence_id, length in zip(sentence_ids, pred_lengths): pred_sentences[sentence_id] = flatten_pred_sentences[ptr:( ptr + length)] ptr += length if local_rank == 0: # Perform detokenization pred_sentences_bpe_decode = [] pred_sentences_raw = [] for sentence in pred_sentences: bpe_decode_sentence = tgt_tokenizer.decode( sentence.tolist()) raw_sentence = base_tgt_tokenizer.decode( bpe_decode_sentence.split()) pred_sentences_bpe_decode.append(bpe_decode_sentence) pred_sentences_raw.append(raw_sentence) detok_sacrebleu_out = sacrebleu.corpus_bleu( sys_stream=pred_sentences_bpe_decode, ref_streams=[tgt_detok_sentences]) raw_sacrebleu_out = sacrebleu.corpus_bleu( sys_stream=pred_sentences_raw, ref_streams=[tgt_raw_sentences]) with open( os.path.join(args.save_dir, f'epoch{epoch_id}_dev_prediction.txt'), 'w') as of: for line in pred_sentences_raw: of.write(line + '\n') logging.info( '[Epoch {}][Iter {}/{}] validation loss/ppl={:.4f}/{:.4f}, ' 'SacreBlEU={}, Detok SacreBLUE={}'.format( epoch_id, train_iter, total_train_iters, avg_val_loss, np.exp(avg_val_loss), raw_sacrebleu_out.score, detok_sacrebleu_out.score)) writer.add_scalar('valid_loss', avg_val_loss, train_iter) writer.add_scalar('valid_bleu', raw_sacrebleu_out.score, train_iter) if args.num_averages > 0: model_averager.copy_back( param_dict) # TODO(sxjscience) Rewrite using update model.save_parameters(os.path.join(args.save_dir, 'average.params'), deduplicate=True)
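# --- Illustrative sketch: warmup + inverse-square-root learning rate ---
# The transformer training above uses InverseSquareRootScheduler with the
# default base_lr = 2.0 / sqrt(num_units) / sqrt(warmup_steps) when --lr is
# not given.  The decay form below follows the common "Noam"/inverse-sqrt
# convention and is an assumption for illustration, not the library's exact
# code.
import math


def inverse_sqrt_lr(step, base_lr, warmup_steps, warmup_init_lr=0.0):
    if step < warmup_steps:
        # Linear warmup from warmup_init_lr up to base_lr.
        return warmup_init_lr + (base_lr - warmup_init_lr) * step / warmup_steps
    # Afterwards decay proportionally to 1 / sqrt(step).
    return base_lr * math.sqrt(warmup_steps) / math.sqrt(step)


# Example with num_units=512 and 4000 warmup steps:
print(inverse_sqrt_lr(step=8000,
                      base_lr=2.0 / math.sqrt(512) / math.sqrt(4000),
                      warmup_steps=4000))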
    with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
        of.write(gluon_cfg.dump())
    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
    gluon_xlmr = convert_params(fairseq_xlmr, gluon_cfg, ctx)
    if args.test:
        test_model(fairseq_xlmr, gluon_xlmr, args.gpu)
    gluon_xlmr.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'),
                               deduplicate=True)
    logging.info('Convert the RoBERTa MLM model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model_mlm.params')))
    gluon_xlmr.backbone_model.save_parameters(
        os.path.join(args.save_dir, 'model.params'), deduplicate=True)
    logging.info('Convert the RoBERTa backbone model in {} to {}'.format(
        os.path.join(args.fairseq_model_path, 'model.pt'),
        os.path.join(args.save_dir, 'model.params')))
    logging.info('Conversion finished!')
    logging.info('Statistics:')
    rename(args.save_dir)


if __name__ == '__main__':
    args = parse_args()
    logging_config()
    convert_fairseq_model(args)
def is_tf_available():
    return tensorflow is not None


def is_mxnet_available():
    return mxnet is not None


if platform.system() == "Windows":
    from signal import CTRL_C_EVENT as SIGKILL
else:
    from signal import SIGKILL

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logging_config(folder='gluonnlp_benchmark', name='benchmark', logger=logger)

_is_memory_tracing_enabled = False

BenchmarkOutput = namedtuple(
    "BenchmarkOutput",
    [
        "inference_result",
        "train_result",
    ],
)


def separate_process_wrapper_fn(func: Callable[[], None],
                                do_multi_processing: bool) -> Callable[[], None]:
    """
def is_tf_available():
    return tensorflow is not None


def is_mxnet_available():
    return mxnet is not None


if platform.system() == "Windows":
    from signal import CTRL_C_EVENT as SIGKILL
else:
    from signal import SIGKILL

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logging_config(logger=logger)

_is_memory_tracing_enabled = False

BenchmarkOutput = namedtuple(
    "BenchmarkOutput",
    [
        "inference_result",
        "train_result",
    ],
)


def separate_process_wrapper_fn(
        func: Callable[[], None],
        do_multi_processing: bool) -> Callable[[], None]:
def parse_args(): parser = argparse.ArgumentParser( description='Transformer for Neural Machine Translation.') parser.add_argument('--train_src_corpus', type=str, help='The source training corpus.') parser.add_argument('--train_tgt_corpus', type=str, help='The target training corpus.') parser.add_argument('--dev_src_corpus', type=str, help='The source dev corpus.') parser.add_argument('--dev_tgt_corpus', type=str, help='The target dev corpus.') parser.add_argument( '--src_tokenizer', choices=[ 'spm', 'subword_nmt', 'yttm', 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe', 'whitespace' ], default='whitespace', type=str, help='The source tokenizer. ' 'Whitespace tokenizer supports processing pre-encoded corpus, ' 'and the tokenizers besides whitespace supports online encoding.') parser.add_argument('--tgt_tokenizer', choices=[ 'spm', 'subword_nmt', 'yttm', 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe', 'whitespace' ], default='whitespace', type=str, help='The target tokenizer.') parser.add_argument('--src_subword_model_path', type=str, help='Path to the source subword model.') parser.add_argument('--src_vocab_path', type=str, help='Path to the source vocab.') parser.add_argument('--tgt_subword_model_path', type=str, help='Path to the target subword model.') parser.add_argument('--tgt_vocab_path', type=str, help='Path to the target vocab.') parser.add_argument('--seed', type=int, default=100, help='The random seed.') parser.add_argument( '--epochs', type=int, default=30, help='Upper epoch limit, ' 'the model will keep training when epochs < 0 and max_update < 0.') parser.add_argument( '--max_update', type=int, default=-1, help='Max update steps, when max_update > 0, epochs will be set to -1, ' 'each update step contains gpu_num * num_accumulated batches.') parser.add_argument( '--save_interval_update', type=int, default=500, help='Update interval of saving checkpoints while using max_update.') parser.add_argument( '--cfg', type=str, default='transformer_base', help='Configuration of the transformer model. ' 'You may select a yml file or use the prebuild configurations.') parser.add_argument('--label_smooth_alpha', type=float, default=0.1, help='Weight of label smoothing') parser.add_argument('--sampler', type=str, choices=['BoundedBudgetSampler', 'FixedBucketSampler'], default='FixedBucketSampler', help='Type of sampler') parser.add_argument( '--batch_size', type=int, default=2700, help='Batch size. Number of tokens per gpu in a minibatch.') parser.add_argument('--val_batch_size', type=int, default=16, help='Batch size for evaluation.') parser.add_argument('--num_buckets', type=int, default=20, help='Bucket number.') parser.add_argument( '--bucket_scheme', type=str, default='exp', help='Strategy for generating bucket keys. It supports: ' '"constant": all the buckets have the same width; ' '"linear": the width of bucket increases linearly; ' '"exp": the width of bucket increases exponentially') parser.add_argument( '--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the throughput of the bucketing') parser.add_argument( '--max_num_tokens', type=int, default=-1, help= 'max tokens num of each batch, applicable while using BoundedBudgetSampler' ) parser.add_argument( '--max_num_sentences', type=int, default=-1, help= 'max sentences num of each batch, applicable while using BoundedBudgetSampler' ) parser.add_argument( '--lr', type=float, default=0.002, help='The learning rate at the end of the warmup stage. 
' 'If it is not given, we will use the formula suggested in the ' 'original Transformer paper:' ' 1.0 / sqrt(d_model) / sqrt(warmup_steps). ' 'Otherwise, we will use the given lr as the final learning rate in ' 'the warmup phase.') parser.add_argument( '--warmup_steps', type=int, default=4000, help='number of warmup steps used in NOAM\'s stepsize schedule') parser.add_argument( '--warmup_init_lr', type=float, default=0.0, help='Initial learning rate at the beginning of the warm-up stage') parser.add_argument( '--num_accumulated', type=int, default=32, help='Number of steps to accumulate the gradients. ' 'This is useful to mimic large batch training with limited gpu memory') parser.add_argument('--magnitude', type=float, default=3.0, help='Magnitude of Xavier initialization') parser.add_argument('--num_averages', type=int, default=-1, help='Perform final testing based on the ' 'average of last num_averages checkpoints. ' 'Use num_average will cause extra gpu memory usage.') parser.add_argument('--log_interval', type=int, default=10, metavar='N', help='report interval') parser.add_argument( '--save_dir', type=str, default='transformer_out', help='directory path to save the final model and training log') parser.add_argument('--overwrite_cache', action='store_true') parser.add_argument('--fp16', action='store_true', help='Whether to use dtype float16') parser.add_argument('--comm_backend', type=str, default='device', choices=['horovod', 'dist_sync_device', 'device'], help='Communication backend.') parser.add_argument( '--gpus', type=str, help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.') args = parser.parse_args() if args.max_update > 0: args.epochs = -1 logging_config(args.save_dir, console=True) logging.info(args) return args
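# --- Illustrative sketch: token-budget batching ---
# The --max_num_tokens / --max_num_sentences options above bound how much each
# batch may contain.  The function below shows the core idea in plain Python;
# it is a simplified stand-in (no shuffling, no padding-aware accounting), not
# gluonnlp's BoundedBudgetSampler implementation.
def bounded_budget_batches(lengths, max_num_tokens=-1, max_num_sentences=-1):
    batches, batch, batch_tokens = [], [], 0
    for idx, (src_len, tgt_len) in enumerate(lengths):
        n_tokens = max(src_len, tgt_len)
        too_many_tokens = (max_num_tokens > 0
                           and batch_tokens + n_tokens > max_num_tokens)
        too_many_sents = (max_num_sentences > 0
                          and len(batch) + 1 > max_num_sentences)
        if batch and (too_many_tokens or too_many_sents):
            batches.append(batch)
            batch, batch_tokens = [], 0
        batch.append(idx)
        batch_tokens += n_tokens
    if batch:
        batches.append(batch)
    return batches


# e.g. (src_len, tgt_len) pairs -> index batches of at most ~10 tokens each
print(bounded_budget_batches([(3, 4), (5, 6), (2, 2), (7, 8)],
                             max_num_tokens=10))  # [[0, 1], [2, 3]]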
def train(args): _, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( args.comm_backend, args.gpus) level = logging.DEBUG if args.verbose else logging.INFO logging_config( args.ckpt_dir, name='pretrain_bert_' + str(rank), # avoid race level=level, console=(local_rank == 0)) logging.info(args) logging.debug('Random seed set to {}'.format(args.seed)) set_seed(args.seed) logging.info('Training info: num_buckets: {}, ' 'num_workers: {}, rank: {}'.format(args.num_buckets, num_workers, rank)) cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l) if args.start_step: logging.info('Restart training from {}'.format(args.start_step)) parameters_option(args.start_step, model, args.ckpt_dir, 'Loading', ctx_l) else: model.initialize(ctx=ctx_l) model.hybridize() if args.raw: get_dataset_fn = functools.partial( get_pretrain_data_text, max_seq_length=args.max_seq_length, short_seq_prob=args.short_seq_prob, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, whole_word_mask=args.whole_word_mask, random_next_sentence=args.random_next_sentence, tokenizer=tokenizer, circle_length=args.circle_length, repeat=args.repeat, dataset_cached=args.dataset_cached, num_max_dataset_cached=args.num_max_dataset_cached) else: get_dataset_fn = get_pretrain_data_npz data_train = get_dataset_fn(args.data, args.batch_size, shuffle=True, num_buckets=args.num_buckets, vocab=tokenizer.vocab, num_parts=num_workers, part_idx=rank, num_dataset_workers=args.num_dataset_workers, num_batch_workers=args.num_batch_workers) param_dict = model.collect_params() # Do not apply weight decay to all the LayerNorm and bias for _, v in model.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 # Set grad_req if gradient accumulation is required params = [p for p in param_dict.values() if p.grad_req != 'null'] num_accumulated = args.num_accumulated if num_accumulated > 1: logging.info( 'Using gradient accumulation. Effective global batch size = {}'. 
format(num_accumulated * args.batch_size * len(ctx_l) * num_workers)) for p in params: p.grad_req = 'add' num_steps = args.num_steps warmup_steps = int(num_steps * args.warmup_ratio) log_interval = args.log_interval save_interval = args.ckpt_interval logging.info( '#Total Training Steps={}, Warmup Steps={}, Save Interval={}'.format( num_steps, warmup_steps, save_interval)) optimizer_params = {'learning_rate': args.lr, 'wd': args.wd} if args.optimizer == 'adamw': optimizer_params.update({ 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-6, 'correct_bias': False, }) if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) elif args.comm_backend == 'byteps': trainer = bps.DistributedTrainer(param_dict, args.optimizer, optimizer_params) else: trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params, update_on_kvstore=False) if args.start_step: logging.info('Restart training from {}'.format(args.start_step)) states_option(args.start_step, trainer, args.ckpt_dir, local_rank, 'Loading') # backend specific implementation if args.comm_backend == 'byteps': trainer._init_params() if args.comm_backend == 'horovod': # Horovod: fetch and broadcast parameters hvd.broadcast_parameters(param_dict, root_rank=0) # prepare the loss function nsp_loss_fn = mx.gluon.loss.SoftmaxCELoss() mlm_loss_fn = mx.gluon.loss.SoftmaxCELoss() nsp_loss_fn.hybridize() mlm_loss_fn.hybridize() mlm_metric = MaskedAccuracy() nsp_metric = MaskedAccuracy() mlm_metric.reset() nsp_metric.reset() step_num = args.start_step if args.phase2: step_num -= args.phase1_num_steps running_mlm_loss, running_nsp_loss = 0., 0. running_num_tks = 0 train_start_time = time.time() tic = time.time() # start training train_loop_dataloader = grouper(repeat(data_train), len(ctx_l)) while step_num < num_steps: step_num += 1 for _ in range(num_accumulated): sample_l = next(train_loop_dataloader) mlm_loss_l = [] nsp_loss_l = [] loss_l = [] ns_label_list, ns_pred_list = [], [] mask_label_list, mask_pred_list, mask_weight_list = [], [], [] for sample, ctx in zip(sample_l, ctx_l): # prepare data (input_id, masked_id, masked_position, masked_weight, \ next_sentence_label, segment_id, valid_length) = sample input_id = input_id.as_in_ctx(ctx) masked_id = masked_id.as_in_ctx(ctx) masked_position = masked_position.as_in_ctx(ctx) masked_weight = masked_weight.as_in_ctx(ctx) next_sentence_label = next_sentence_label.as_in_ctx(ctx) segment_id = segment_id.as_in_ctx(ctx) valid_length = valid_length.as_in_ctx(ctx) with mx.autograd.record(): _, _, nsp_score, mlm_scores = model( input_id, segment_id, valid_length, masked_position) denominator = (masked_weight.sum() + 1e-8) * num_accumulated * len(ctx_l) mlm_scores_r = mx.npx.reshape(mlm_scores, (-5, -1)) masked_id_r = masked_id.reshape((-1, )) mlm_loss = mlm_loss_fn(mlm_scores_r, masked_id_r, masked_weight.reshape( (-1, 1))).sum() / denominator denominator = num_accumulated * len(ctx_l) nsp_loss = nsp_loss_fn( nsp_score, next_sentence_label).mean() / denominator mlm_loss_l.append(mlm_loss) nsp_loss_l.append(nsp_loss) loss_l.append(mlm_loss + nsp_loss) mask_label_list.append(masked_id_r) mask_pred_list.append(mlm_scores_r) mask_weight_list.append(masked_weight.reshape((-1, ))) ns_label_list.append(next_sentence_label) ns_pred_list.append(nsp_score) running_num_tks += valid_length.sum().as_in_ctx(mx.cpu()) for loss in loss_l: loss.backward() running_mlm_loss += sum([ ele.as_in_ctx(mx.cpu()) for ele in mlm_loss_l ]).asnumpy().item() running_nsp_loss += sum([ 
ele.as_in_ctx(mx.cpu()) for ele in nsp_loss_l ]).asnumpy().item() mlm_metric.update(mask_label_list, mask_pred_list, mask_weight_list) nsp_metric.update(ns_label_list, ns_pred_list) # update trainer.allreduce_grads() total_norm, ratio, is_finite = clip_grad_global_norm( params, args.max_grad_norm * num_workers) total_norm = total_norm / num_workers # update learning rate scheduled_lr = args.lr if step_num <= warmup_steps: scheduled_lr *= step_num / warmup_steps else: offset = (num_steps - step_num) / (num_steps - warmup_steps) scheduled_lr *= max(offset, 0) trainer.set_learning_rate(scheduled_lr) if args.comm_backend == 'horovod' or args.comm_backend == 'byteps': # Note that horovod.trainer._scale is default to num_workers, # thus trainer.update(1) will scale the gradients by 1./num_workers. # *num_workers* of Horovod is the number of GPUs. trainer.update(1, ignore_stale_grad=True) else: # gluon.trainer._scale is default to 1. # *num_workers* of Trainer is the number of machines. trainer.update(num_workers, ignore_stale_grad=True) if num_accumulated > 1: # set grad to zero for gradient accumulation model.zero_grad() # saving if step_num % save_interval == 0 or step_num >= num_steps: states_option(step_num, trainer, args.ckpt_dir, local_rank, 'Saving') if local_rank == 0: parameters_option(step_num, model, args.ckpt_dir, 'Saving') # logging if step_num % log_interval == 0: running_mlm_loss /= log_interval running_nsp_loss /= log_interval toc = time.time() logging.info( '[step {}], Loss mlm/nsp={:.5f}/{:.3f}, Acc mlm/nsp={:.3f}/{:.3f}, ' ' LR={:.7f}, grad_norm={:.4f}. Time cost={:.2f} s,' ' Throughput={:.1f}K tks/s, ETA={:.2f} h'.format( step_num, running_mlm_loss, running_nsp_loss, mlm_metric.get()[1], nsp_metric.get()[1], trainer.learning_rate, total_norm, toc - tic, running_num_tks.asnumpy().item() / (toc - tic) / 1000, (num_steps - step_num) / (step_num / (toc - train_start_time)) / 3600)) mlm_metric.reset() nsp_metric.reset() tic = time.time() running_mlm_loss = 0 running_nsp_loss = 0 running_num_tks = 0 logging.info('Finish training step: %d', step_num) mx.npx.waitall() train_end_time = time.time() logging.info('Train cost={:.1f} s'.format(train_end_time - train_start_time)) if local_rank == 0: model_name = args.model_name.replace('google', 'gluon') save_dir = os.path.join(args.ckpt_dir, model_name) final_save(model, save_dir, tokenizer, cfg)
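# --- Illustrative sketch: the learning-rate schedule used inline above ---
# The pretraining loop updates the learning rate by hand: linear warmup to
# args.lr over warmup_steps, then linear decay to zero at num_steps.  This is
# only a restatement of that logic as a standalone function for clarity.
def warmup_linear_decay_lr(step_num, base_lr, num_steps, warmup_steps):
    if step_num <= warmup_steps:
        return base_lr * step_num / warmup_steps
    offset = (num_steps - step_num) / (num_steps - warmup_steps)
    return base_lr * max(offset, 0)


# e.g. base_lr=1e-4, 10k warmup steps out of 100k total
print(warmup_linear_decay_lr(5_000, 1e-4, 100_000, 10_000))   # 5e-05 (warmup)
print(warmup_linear_decay_lr(55_000, 1e-4, 100_000, 10_000))  # 5e-05 (decay)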
def train(args): store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( args.comm_backend, args.gpus) task = get_task(args.task_name) #setup_logging(args, local_rank) level = logging.INFO detail_dir = os.path.join(args.output_dir, args.task_name) if not os.path.exists(detail_dir): os.mkdir(detail_dir) logging_config( detail_dir, name='train_{}_{}_'.format(args.task_name, args.model_name) + str(rank), # avoid race level=level, console=(local_rank == 0)) logging.info(args) cfg, tokenizer, classify_net, use_segmentation = \ get_network(args.model_name, ctx_l, args.param_checkpoint, args.backbone_path, task) logging.info('Prepare training data') train_data, _ = get_task_data(args, tokenizer, segment='train', task=task) train_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), bf.Stack()) epoch_num_updates = len(train_data) // args.batch_size max_update = epoch_num_updates * args.epochs warmup_steps = int(np.ceil(max_update * args.warmup_ratio)) dataloader = DataLoader(train_data, batch_size=args.batch_size, batchify_fn=train_batchify, num_workers=4, shuffle=True) dataloader = grouper(repeat(dataloader), len(ctx_l)) param_dict = classify_net.collect_params() # Do not apply weight decay to all the LayerNorm and bias for _, v in classify_net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 # Set grad_req if gradient accumulation is required params = [p for p in param_dict.values() if p.grad_req != 'null'] num_accumulated = args.num_accumulated if num_accumulated > 1: logging.info( 'Using gradient accumulation. Effective global batch size = {}'. format(num_accumulated * args.batch_size * len(ctx_l) * num_workers)) for p in params: p.grad_req = 'add' if args.comm_backend == 'horovod': # Horovod: fetch and broadcast parameters hvd.broadcast_parameters(param_dict, root_rank=0) lr_scheduler = PolyScheduler(max_update=max_update, base_lr=args.lr, warmup_begin_lr=0.0, pwr=1, final_lr=0.0, warmup_steps=warmup_steps, warmup_mode='linear') optimizer_params = { 'learning_rate': args.lr, 'wd': args.wd, 'lr_scheduler': lr_scheduler } if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) else: trainer = mx.gluon.Trainer(classify_net.collect_params(), 'adamw', optimizer_params) if args.task_name == 'sts': loss_function = gluon.loss.L2Loss() else: loss_function = gluon.loss.SoftmaxCELoss() #prepare loss function log_loss = 0 log_gnorm = 0 log_step = 0 if args.log_interval > 0: log_interval = args.log_interval else: log_interval = int(epoch_num_updates * 0.5) for i in range(max_update): sample_l = next(dataloader) loss_l = [] for sample, ctx in zip(sample_l, ctx_l): (token_ids, token_types, valid_length), label = sample # Move to the corresponding context token_ids = mx.np.array(token_ids, ctx=ctx) token_types = mx.np.array(token_types, ctx=ctx) valid_length = mx.np.array(valid_length, ctx=ctx) label = mx.np.array(label, ctx=ctx) with mx.autograd.record(): scores = classify_net(token_ids, token_types, valid_length) loss = loss_function(scores, label).mean() / len(ctx_l) loss_l.append(loss) for loss in loss_l: loss.backward() trainer.allreduce_grads() # Begin Norm Clipping total_norm, ratio, is_finite = clip_grad_global_norm( params, args.max_grad_norm) trainer.update(1.0) step_loss = sum([loss.asnumpy() for loss in loss_l]) log_loss += step_loss log_gnorm += total_norm log_step += 1 if log_step >= log_interval or i == max_update - 1: logging.info( '[Iter {} / {}] avg {} = {:.2f}, avg gradient norm = {:.2f}'. 
                format(i + 1, max_update, 'nll',
                       log_loss / log_step, log_gnorm / log_step))
            log_loss = 0
            log_gnorm = 0
            log_step = 0
        if local_rank == 0 and (i == max_update - 1 or
                                i % (max_update // args.epochs) == 0 and i > 0):
            ckpt_name = '{}_{}_{}.params'.format(args.model_name,
                                                 args.task_name, (i + 1))
            params_saved = os.path.join(detail_dir, ckpt_name)
            classify_net.save_parameters(params_saved)
            logging.info('Params saved in: {}'.format(params_saved))
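# --- Illustrative sketch: gradient accumulation in Gluon ---
# Several of the training loops above set grad_req='add' so that backward()
# accumulates gradients over num_accumulated micro-batches before one trainer
# update.  The minimal example below shows only that pattern; the real loops
# add allreduce, gradient clipping and LR scheduling on top of it, and the
# tiny Dense network here is purely a placeholder.
import mxnet as mx
from mxnet.gluon import nn

mx.npx.set_np()
net = nn.Dense(1)
net.initialize()
trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
for p in net.collect_params().values():
    p.grad_req = 'add'   # accumulate into .grad instead of overwriting it

num_accumulated = 4
data = mx.np.random.normal(size=(num_accumulated, 8, 16))
label = mx.np.random.normal(size=(num_accumulated, 8, 1))
for i in range(num_accumulated):
    with mx.autograd.record():
        # Scale so the accumulated gradient matches one large batch.
        loss = ((net(data[i]) - label[i]) ** 2).mean() / num_accumulated
    loss.backward()
trainer.step(1)
net.zero_grad()  # reset accumulated gradients before the next update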
def parse_args(): parser = argparse.ArgumentParser( description='Transformer for Neural Machine Translation. Load a checkpoint and inference.') parser.add_argument('--seed', type=int, default=100, help='The random seed.') parser.add_argument('--src_lang', type=str, default='en', help='Source language') parser.add_argument('--tgt_lang', type=str, default='de', help='Target language') parser.add_argument('--src_corpus', type=str, required=True, help='The source corpus for evaluation.') parser.add_argument('--tgt_corpus', type=str, default=None, help='The target corpus for evaluation.') parser.add_argument('--src_normalizer', choices=['no', 'moses'], default='moses', help='The sentence normalizer that will be ' 'used to normalize the source sentence.') parser.add_argument('--src_base_tokenizer', choices=['whitespace', 'moses', 'no'], default='moses', help='The base tokenizer to tokenize the target ' 'sentence into a list of tokens.') parser.add_argument('--src_tokenizer', choices=['spm', 'subword_nmt', 'yttm', 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'], required=True, type=str, help='The source subword tokenizer. ' 'Only supports online encoding at present.') parser.add_argument('--tgt_normalizer', choices=['no', 'moses'], default='moses', help='The sentence normalizer that will be ' 'used to normalize the target sentence.') parser.add_argument('--tgt_base_tokenizer', choices=['whitespace', 'moses', 'no'], default='moses', help='The base tokenizer to tokenize the source ' 'sentence into a list of tokens.') parser.add_argument('--tgt_tokenizer', choices=['spm', 'subword_nmt', 'yttm', 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'], required=True, type=str, help='The target tokenizer. Only supports online encoding at present.') parser.add_argument('--src_subword_model_path', type=str, help='Path to the source subword model.') parser.add_argument('--src_vocab_path', type=str, help='Path to the source subword vocab.') parser.add_argument('--tgt_subword_model_path', type=str, help='Path to the target subword model.') parser.add_argument('--tgt_vocab_path', type=str, help='Path to the target subword vocab.') parser.add_argument('--src_max_len', type=int, default=None, help='Maximum length of the source sentence.') parser.add_argument('--tgt_max_len', type=int, default=None, help='Maximum length of the target sentence.') parser.add_argument('--cfg', type=str, help='Config file of the Transformer model.') parser.add_argument('--beam-size', type=int, default=4, help='Number of beams') parser.add_argument('--lp_alpha', type=float, default=0.6, help='The alpha value in the length penalty') parser.add_argument('--lp_k', type=int, default=5, help='The K value in the length penalty') parser.add_argument('--max_length_a', type=int, default=1, help='The a in the a * x + b formula of beam search') parser.add_argument('--max_length_b', type=int, default=50, help='The b in the a * x + b formula of beam search') parser.add_argument('--param_path', type=str, help='The path to the model parameters.') parser.add_argument('--gpus', type=str, default='0', help='List of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.' 
                        '(using a single gpu is suggested)')
    parser.add_argument('--save_dir', type=str, default=None,
                        help='The path to save the log files and predictions.')
    parser.add_argument('--stochastic', action='store_true',
                        help='Whether to use stochastic beam search')
    parser.add_argument('--temperature', type=float, default=None,
                        help='The temperature used for softmax normalization '
                             'in the stochastic setting')
    parser.add_argument('--inference', action='store_true',
                        help='Whether to run inference on your own data. '
                             'When doing inference, tgt_corpus is not needed '
                             'and will be set to None.')
    parser.add_argument('--fp16', action='store_true',
                        help='Whether to use dtype float16')
    args = parser.parse_args()
    if args.save_dir is None:
        args.save_dir = os.path.splitext(args.param_path)[0] + '_evaluation'
    assert args.inference or args.tgt_corpus, \
        'requiring --tgt_corpus while not using --inference'
    if args.inference:
        args.tgt_corpus = None
    if args.stochastic:
        if args.temperature is None:
            args.temperature = 0.5
    logging_config(args.save_dir, console=True)
    logging.info(args)
    return args
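# --- Illustrative sketch: the length penalty behind --lp_alpha / --lp_k ---
# Beam-search scoring above divides the summed log-probability of a hypothesis
# by a length penalty.  The GNMT-style form of Wu et al. (2016) is shown below;
# it is assumed to match what BeamSearchScorer does and is included only to
# make the two knobs concrete.
def length_penalty(length, alpha=0.6, k=5):
    return ((k + length) / (k + 1.0)) ** alpha


def scored(log_prob_sum, length, alpha=0.6, k=5):
    # Longer hypotheses are divided by a larger penalty, counteracting the
    # bias of summed log-probabilities towards short outputs.
    return log_prob_sum / length_penalty(length, alpha, k)


print(scored(-6.0, length=10))
print(scored(-6.0, length=20))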
    rtd_preds = mx.np.round((mx.np.sign(rtd_scores) + 1) / 2).astype(np.int32)

    mlm_accuracy = accuracy(unmasked_tokens, mlm_preds, masked_weights)
    corrupted_mlm_accuracy = accuracy(unmasked_tokens, corrupted_tokens,
                                      masked_weights)
    rtd_accuracy = accuracy(rtd_labels, rtd_preds, length_masks)
    rtd_precision = accuracy(rtd_labels, rtd_preds, length_masks * rtd_preds)
    rtd_recall = accuracy(rtd_labels, rtd_preds, rtd_labels * rtd_preds)
    rtd_auc = auc(rtd_labels, rtd_probs, length_masks)
    writer.add_scalars(
        'results', {
            'mlm_accuracy': mlm_accuracy.asnumpy().item(),
            'corrupted_mlm_accuracy': corrupted_mlm_accuracy.asnumpy().item(),
            'rtd_accuracy': rtd_accuracy.asnumpy().item(),
            'rtd_precision': rtd_precision.asnumpy().item(),
            'rtd_recall': rtd_recall.asnumpy().item(),
            'rtd_auc': rtd_auc
        }, step_num)


if __name__ == '__main__':
    os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
    os.environ['MXNET_USE_FUSION'] = '0'  # Manually disable pointwise fusion
    args = parse_args()
    logging_config(args.output_dir, name='pretrain_owt')
    logging.debug('Random seed set to {}'.format(args.seed))
    logging.info(args)
    set_seed(args.seed)
    if args.do_train:
        train(args)
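# --- Illustrative sketch: masked accuracy / precision / recall ---
# The accuracy(...) calls above compute accuracy only over positions selected
# by a 0/1 mask.  Passing mask * preds restricts the denominator to predicted
# positives (precision-like), and mask * labels to true positives' positions
# (recall-like).  A small numpy sketch of that idea, not the library code:
import numpy as np


def masked_accuracy(labels, preds, mask):
    mask = mask.astype(np.float64)
    correct = (labels == preds).astype(np.float64) * mask
    return correct.sum() / max(mask.sum(), 1e-8)


labels = np.array([1, 0, 1, 1, 0])
preds = np.array([1, 0, 0, 1, 1])
valid = np.array([1, 1, 1, 1, 0])                       # ignore padded position
print(masked_accuracy(labels, preds, valid))            # plain accuracy
print(masked_accuracy(labels, preds, valid * preds))    # precision-like
print(masked_accuracy(labels, preds, valid * labels))   # recall-like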
        ckpt_candidates = [args.param_checkpoint]
    else:
        ckpt_candidates = [f for f in os.listdir(args.output_dir)
                           if f.endswith('.params')]
        ckpt_candidates.sort(key=lambda ele: (len(ele), ele))
        if last:
            ckpt_candidates = ckpt_candidates[-1:]
    best_eval = {}
    for ckpt_name in ckpt_candidates:
        logging.info('Starting to evaluate the checkpoint {}'.format(ckpt_name))
        ckpt_path = os.path.join(args.output_dir, ckpt_name)
        qa_net.load_parameters(ckpt_path, ctx=ctx_l, cast_dtype=True)
        best_eval = eval_validation(ckpt_name, best_eval)
    logging.info('The best evaluated results are {}'.format(json.dumps(best_eval)))
    output_eval_results_file = os.path.join(args.output_dir, 'best_results.json')
    with open(output_eval_results_file, 'w') as of:
        of.write(json.dumps(best_eval, indent=4) + '\n')
    return best_eval


if __name__ == '__main__':
    os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
    args = parse_args()
    logging_config(args.output_dir, name='finetune_squad{}'.format(args.version))
    set_seed(args.seed)
    if args.do_train:
        train(args)
    if args.do_eval:
        evaluate(args, last=not args.all_evaluate)
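# --- Small aside: why sort checkpoints by (len(name), name) ---
# The key used above orders numerically-suffixed checkpoint files correctly
# without parsing the number, e.g. 'epoch9.params' before 'epoch10.params',
# which plain lexicographic sorting would not.  It relies on all names sharing
# the same prefix and suffix, which holds for the files saved by these scripts.
names = ['epoch10.params', 'epoch2.params', 'epoch9.params']
print(sorted(names))                                   # lexicographic: 10, 2, 9
print(sorted(names, key=lambda ele: (len(ele), ele)))  # epoch2, epoch9, epoch10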
def train(args): store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( args.comm_backend, args.gpus) logging_config( args.output_dir, name='pretrain_owt_' + str(rank), # avoid race console=(local_rank == 0)) logging.info(args) logging.debug('Random seed set to {}'.format(args.seed)) set_seed(args.seed) logging.info('Training info: num_buckets: {}, ' 'num_workers: {}, rank: {}'.format(args.num_buckets, num_workers, rank)) cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l, args.max_seq_length, args.hidden_dropout_prob, args.attention_dropout_prob, args.generator_units_scale, args.generator_layers_scale) data_masker = ElectraMasker(tokenizer, args.max_seq_length, mask_prob=args.mask_prob, replace_prob=args.replace_prob) if args.from_raw_text: if args.cached_file_path and not os.path.exists(args.cached_file_path): os.mkdir(args.cached_file_path) get_dataset_fn = functools.partial( get_pretrain_data_text, max_seq_length=args.max_seq_length, short_seq_prob=args.short_seq_prob, tokenizer=tokenizer, circle_length=args.circle_length, repeat=args.repeat, cached_file_path=args.cached_file_path) logging.info( 'Processing and loading the training dataset from raw text.') else: logging.info('Loading the training dataset from local Numpy file.') get_dataset_fn = get_pretrain_data_npz data_train = get_dataset_fn(args.data, args.batch_size, shuffle=True, num_buckets=args.num_buckets, vocab=tokenizer.vocab, num_parts=num_workers, part_idx=rank, num_dataset_workers=args.num_dataset_workers, num_batch_workers=args.num_batch_workers) logging.info('Creating distributed trainer...') param_dict = model.collect_params() # Do not apply weight decay to all the LayerNorm and bias for _, v in model.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 # Collect differentiable parameters params = [p for p in param_dict.values() if p.grad_req != 'null'] # Set grad_req if gradient accumulation is required num_accumulated = args.num_accumulated if num_accumulated > 1: logging.info( 'Using gradient accumulation. Effective global batch size = {}'. 
format(num_accumulated * args.batch_size * len(ctx_l) * num_workers)) for p in params: p.grad_req = 'add' # backend specific implementation if args.comm_backend == 'horovod': # Horovod: fetch and broadcast parameters hvd.broadcast_parameters(param_dict, root_rank=0) num_train_steps = args.num_train_steps if args.warmup_steps is not None: warmup_steps = args.warmup_steps else: warmup_steps = int(num_train_steps * args.warmup_ratio) assert warmup_steps is not None, 'Must specify either warmup_steps or warmup_ratio' log_interval = args.log_interval save_interval = args.save_interval if args.save_interval is not None\ else num_train_steps // 50 logging.info( '#Total Training Steps={}, Warmup={}, Save Interval={}'.format( num_train_steps, warmup_steps, save_interval)) lr_scheduler = PolyScheduler(max_update=num_train_steps, base_lr=args.lr, warmup_begin_lr=0, pwr=1, final_lr=0, warmup_steps=warmup_steps, warmup_mode='linear') optimizer_params = { 'learning_rate': args.lr, 'wd': args.wd, 'lr_scheduler': lr_scheduler, } if args.optimizer == 'adamw': optimizer_params.update({ 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-6, 'correct_bias': False, }) if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) else: trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params, update_on_kvstore=False) if args.start_step: logging.info('Restart training from {}'.format(args.start_step)) # TODO(zheyuye), How about data splitting, where to start re-training state_path = states_option(args.start_step, trainer, args.output_dir, local_rank, 'Loading') param_path = parameters_option(args.start_step, model, args.output_dir, 'Loading') # prepare the loss function mlm_loss_fn = mx.gluon.loss.SoftmaxCELoss() rtd_loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss() mlm_loss_fn.hybridize() rtd_loss_fn.hybridize() # prepare the records writer writer = None # only one process on each worker will write the tensorboardX's records to avoid race if args.do_eval and local_rank == 0: from tensorboardX import SummaryWriter record_path = os.path.join(args.output_dir, 'records') logging.info('Evaluation records saved in {}'.format(record_path)) writer = SummaryWriter(record_path) step_num = args.start_step finish_flag = False log_total_loss = 0 log_mlm_loss = 0 log_rtd_loss = 0 log_sample_num = 0 train_start_time = time.time() # start training train_loop_dataloader = grouper(repeat(data_train), len(ctx_l)) while step_num < num_train_steps: tic = time.time() for accum_idx in range(num_accumulated): sample_l = next(train_loop_dataloader) loss_l = [] mlm_loss_l = [] rtd_loss_l = [] for sample, ctx in zip(sample_l, ctx_l): if sample is None: continue # prepare data input_ids, segment_ids, valid_lengths = sample input_ids = input_ids.as_in_ctx(ctx) segment_ids = segment_ids.as_in_ctx(ctx) valid_lengths = valid_lengths.as_in_ctx(ctx) masked_input = data_masker.dynamic_masking( mx.nd, input_ids, valid_lengths) masked_input_ids = masked_input.input_ids length_masks = masked_input.masks unmasked_tokens = masked_input.unmasked_tokens masked_positions = masked_input.masked_positions masked_weights = masked_input.masked_weights log_sample_num += len(masked_input_ids) with mx.autograd.record(): mlm_scores, rtd_scores, corrupted_tokens, labels = model( masked_input_ids, segment_ids, valid_lengths, unmasked_tokens, masked_positions) denominator = (masked_weights.sum() + 1e-6) * num_accumulated * len(ctx_l) mlm_loss = mlm_loss_fn( mx.npx.reshape(mlm_scores, (-5, -1)), 
unmasked_tokens.reshape( (-1, )), masked_weights.reshape( (-1, 1))).sum() / denominator denominator = (length_masks.sum() + 1e-6) * num_accumulated * len(ctx_l) rtd_loss = rtd_loss_fn(rtd_scores, labels, length_masks).sum() / denominator output = ElectraOutput( mlm_scores=mlm_scores, rtd_scores=rtd_scores, rtd_labels=labels, corrupted_tokens=corrupted_tokens, ) mlm_loss_l.append(mlm_loss) rtd_loss_l.append(rtd_loss) loss = (args.gen_weight * mlm_loss + args.disc_weight * rtd_loss) loss_l.append(loss) for loss in loss_l: loss.backward() # All Reduce the Step Loss log_mlm_loss += sum( [ele.as_in_ctx(ctx_l[0]) for ele in mlm_loss_l]).asnumpy() log_rtd_loss += sum( [ele.as_in_ctx(ctx_l[0]) for ele in rtd_loss_l]).asnumpy() log_total_loss += sum([ele.as_in_ctx(ctx_l[0]) for ele in loss_l]).asnumpy() # update trainer.allreduce_grads() total_norm, ratio, is_finite = clip_grad_global_norm( params, args.max_grad_norm * num_workers) if args.comm_backend == 'horovod': # Note that horovod.trainer._scale is default to num_workers, # thus trainer.update(1) will scale the gradients by 1./num_workers trainer.update(1, ignore_stale_grad=True) else: # gluon.trainer._scale is default to 1 trainer.update(num_workers, ignore_stale_grad=True) total_norm = total_norm / num_workers step_num += 1 if num_accumulated > 1: # set grad to zero for gradient accumulation model.zero_grad() # saving if step_num % save_interval == 0 or step_num >= num_train_steps: if is_master_node: states_option(step_num, trainer, args.output_dir, local_rank, 'Saving') if local_rank == 0: param_path = parameters_option(step_num, model, args.output_dir, 'Saving') # logging if step_num % log_interval == 0: # Output the loss of per step log_mlm_loss /= log_interval log_rtd_loss /= log_interval log_total_loss /= log_interval toc = time.time() logging.info('[step {}], Loss mlm/rtd/total={:.4f}/{:.4f}/{:.4f},' ' LR={:.6f}, grad_norm={:.4f}. Time cost={:.2f},' ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format( step_num, log_mlm_loss, log_rtd_loss, log_total_loss, trainer.learning_rate, total_norm, toc - tic, log_sample_num / (toc - tic), (num_train_steps - step_num) / (step_num / (toc - train_start_time)) / 3600)) tic = time.time() if args.do_eval: evaluation(writer, step_num, masked_input, output) if writer is not None: writer.add_scalars( 'loss', { 'total_loss': log_total_loss, 'mlm_loss': log_mlm_loss, 'rtd_loss': log_rtd_loss }, step_num) log_mlm_loss = 0 log_rtd_loss = 0 log_total_loss = 0 log_sample_num = 0 logging.info('Finish training step: %d', step_num) if is_master_node: state_path = states_option(step_num, trainer, args.output_dir, local_rank, 'Saving') if local_rank == 0: param_path = parameters_option(step_num, model, args.output_dir, 'Saving') mx.npx.waitall() train_end_time = time.time() logging.info('Train cost={:.1f}s'.format(train_end_time - train_start_time)) if writer is not None: writer.close() if local_rank == 0: model_name = args.model_name.replace('google', 'gluon') save_dir = os.path.join(args.output_dir, model_name) final_save(model, save_dir, tokenizer)
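# --- Illustrative sketch: the loss normalization used above ---
# Per-position losses are summed and divided by (number of masked positions) x
# num_accumulated x (number of devices), so gradients accumulated over
# micro-batches and devices average out to a per-token loss; gen_weight and
# disc_weight then trade off the generator (MLM) and discriminator (RTD)
# terms.  A numpy restatement with placeholder numbers (the ELECTRA paper uses
# a discriminator weight around 50; the script's defaults may differ):
import numpy as np


def normalized_loss(per_position_loss, weights, num_accumulated, num_devices):
    denominator = (weights.sum() + 1e-6) * num_accumulated * num_devices
    return (per_position_loss * weights).sum() / denominator


mlm_loss = normalized_loss(np.array([2.0, 1.0, 3.0]), np.array([1.0, 1.0, 0.0]),
                           num_accumulated=2, num_devices=1)
rtd_loss = 0.05                      # placeholder discriminator loss value
gen_weight, disc_weight = 1.0, 50.0  # placeholder weights
total_loss = gen_weight * mlm_loss + disc_weight * rtd_loss
print(mlm_loss, total_loss)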
def evaluate(args): store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( args.comm_backend, args.gpus) # setup_logging(args, local_rank) task = get_task(args.task_name, args.train_dir, args.eval_dir) level = logging.INFO detail_dir = os.path.join(args.output_dir, args.task_name) if not os.path.exists(detail_dir): os.mkdir(detail_dir) logging_config( detail_dir, name='train_{}_{}_'.format(args.task_name, args.model_name) + str(rank), # avoid race level=level, console=(local_rank == 0)) if rank != 0: logging.info('Skipping node {}'.format(rank)) return ctx_l = parse_ctx(args.gpus) logging.info( 'Srarting inference without horovod on the first node on device {}'. format(str(ctx_l))) cfg, tokenizer, classify_net, use_segmentation = \ get_network(args.model_name, ctx_l, args.param_checkpoint, args.backbone_path, task) candidate_ckpt = [] detail_dir = os.path.join(args.output_dir, args.task_name) for name in os.listdir(detail_dir): if name.endswith( '.params' ) and args.task_name in name and args.model_name in name: candidate_ckpt.append(os.path.join(detail_dir, name)) best_ckpt = {} metrics = task.metric def evaluate_by_ckpt(ckpt_name, best_ckpt): classify_net.load_parameters(ckpt_name, ctx=ctx_l, cast_dtype=True) logging.info('Prepare dev data') dev_data, label = get_task_data(args, task, tokenizer, segment='eval') dev_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), bf.Stack()) dataloader = DataLoader(dev_data, batch_size=args.batch_size, batchify_fn=dev_batchify, shuffle=False) for sample_l in grouper(dataloader, len(ctx_l)): for sample, ctx in zip(sample_l, ctx_l): if sample is None: continue (token_ids, token_types, valid_length), label = sample token_ids = mx.np.array(token_ids, ctx=ctx) token_types = mx.np.array(token_types, ctx=ctx) valid_length = mx.np.array(valid_length, ctx=ctx) scores = classify_net(token_ids, token_types, valid_length) if task.task_name == 'sts': label = label.reshape((-1, 1)) for metric in metrics: metric.update([label], [scores]) #pred.append(scores) for metric in metrics: metric_name, result = metric.get() logging.info('checkpoint {} get result: {}:{}'.format( ckpt_name, metric_name, result)) if best_ckpt.get(metric_name, [0, ''])[0] < result: best_ckpt[metric_name] = [result, ckpt_name] for metric in metrics: metric.reset() for ckpt_name in candidate_ckpt: evaluate_by_ckpt(ckpt_name, best_ckpt) for metric_name in best_ckpt: logging.info( 'best result on metric {}: is {}, and on checkpoint {}'.format( metric_name, best_ckpt[metric_name][0], best_ckpt[metric_name][1]))
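# --- Illustrative sketch: the repeat(...) / grouper(...) pattern ---
# Many of the loops above build train_loop_dataloader = grouper(repeat(loader),
# len(ctx_l)): cycle over the dataloader forever and hand out one chunk of
# batches per device each step.  The simplified stand-ins below mirror that
# usage; they are not the gluonnlp helper implementations.
import itertools


def repeat(iterable):
    while True:
        for item in iterable:
            yield item


def grouper(iterable, n):
    it = iter(iterable)
    while True:
        yield list(itertools.islice(it, n))


batches = ['b0', 'b1', 'b2']   # stand-in for a DataLoader
num_devices = 2
stream = grouper(repeat(batches), num_devices)
print(next(stream))  # ['b0', 'b1']
print(next(stream))  # ['b2', 'b0']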