def generic_train( model: BaseTransformer, args: argparse.Namespace, logging_callback=None, checkpoint_callback=None, extra_callbacks=[], logger=True, # can pass WandbLogger() here **extra_train_kwargs): pl.seed_everything(args.seed) # init model odir = Path(model.hparams.output_dir) odir.mkdir(exist_ok=True) # add custom checkpoints if checkpoint_callback is None: checkpoint_callback = pl.callbacks.ModelCheckpoint( filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1) if logging_callback is None: logging_callback = LoggingCallback() train_params = {} train_params["limit_val_batches"] = 2 # TODO: remove with PyTorch 1.6 since pl uses native amp if args.fp16: train_params["precision"] = 16 train_params["amp_level"] = args.amp_level if args.gpus > 1 or args.num_nodes > 1: train_params["distributed_backend"] = "ddp" train_params["accelerator"] = "ddp" trainer = pl.Trainer.from_argparse_args( args, weights_summary=None, callbacks=[logging_callback] + extra_callbacks, logger=logger, checkpoint_callback=checkpoint_callback, **train_params, ) if args.affinity != 'disabled': affinity = set_affinity(get_rank(), get_device_count(), args.affinity) print(f'{get_rank()}: thread affinity: {affinity}') if args.do_train: trainer.fit(model) return trainer
import torch from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, early_stopping from data_loading.data_module import DataModule from models.nn_unet import NNUnet from utils.gpu_affinity import set_affinity from utils.logger import LoggingCallback from utils.utils import get_main_args, is_main_process, log, make_empty_dir, set_cuda_devices, verify_ckpt_path if __name__ == "__main__": args = get_main_args() if args.affinity != "disabled": affinity = set_affinity(os.getenv("LOCAL_RANK", "0"), args.affinity) set_cuda_devices(args) seed_everything(args.seed) data_module = DataModule(args) data_module.prepare_data() data_module.setup() ckpt_path = verify_ckpt_path(args) callbacks = None model_ckpt = None if args.benchmark: model = NNUnet(args) batch_size = args.batch_size if args.exec_mode == "train" else args.val_batch_size log_dir = os.path.join(args.results, args.logname if args.logname is not None else "perf.json") callbacks = [
def main(args): ## Distributed computing # utility for synchronization def reduce_tensor(tensor): rt = tensor.clone() torch.distributed.all_reduce(rt, op = torch.distributed.ReduceOp.SUM) return rt # enable distributed computing if args.distributed: set_affinity(args.local_rank) num_devices = torch.cuda.device_count() torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend = 'nccl', init_method = 'env://') world_size = torch.distributed.get_world_size() #os.environ['WORLD_SIZE'] print('num_devices', num_devices, 'local_rank', args.local_rank, 'world_size', world_size) else: # if not args.distributed: num_devices, world_size = 1, 1 ## Model preparation (Conv-LSTM or Conv-TT-LSTM) # construct the model with the specified hyper-parameters model = ConvLSTMNet( input_channels = args.img_channels, output_sigmoid = args.use_sigmoid, # model architecture layers_per_block = (3, 3, 3, 3), hidden_channels = (32, 48, 48, 32), skip_stride = 2, # convolutional tensor-train layers cell = args.model, cell_params = { "order": args.model_order, "steps": args.model_steps, "ranks": args.model_ranks}, # convolutional parameters kernel_size = args.kernel_size).cuda() if args.distributed: if args.use_apex: # use DDP from apex.parallel from apex.parallel import DistributedDataParallel as DDP model = DDP(model, delay_allreduce = True) else: # use DDP from nn.parallel from torch.nn.parallel import DistributedDataParallel as DDP model = DDP(model, device_ids = [args.local_rank]) PSmodel = PSmodels.PerceptualLoss(model = 'net-lin', net = 'alex', use_gpu = True, gpu_ids = [args.local_rank]) ## Dataset Preparation (KTH, UCF, tinyUCF) Dataset = {"KTH": KTH_Dataset, "MNIST": MNIST_Dataset}[args.dataset] DATA_DIR = os.path.join("../data", {"MNIST": "mnist", "KTH": "kth"}[args.dataset]) # batch size for each process total_batch_size = args.batch_size assert total_batch_size % world_size == 0, \ 'The batch_size is not divisible by world_size.' 
batch_size = total_batch_size // world_size total_frames = args.input_frames + args.future_frames # dataloaer for the valiation dataset test_data_path = os.path.join(DATA_DIR, args.test_data_file) assert os.path.exists(test_data_path), \ "The test dataset does not exist. "+test_data_path test_dataset = Dataset({"path": test_data_path, "unique_mode": True, "num_frames": total_frames, "num_samples": args.test_samples, "height": args.img_height, "width": args.img_width, "channels": args.img_channels, 'training': False}) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas = world_size, rank = args.local_rank, shuffle = False) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size = batch_size, drop_last = True, num_workers = num_devices * 4, pin_memory = True, sampler = test_sampler) test_samples = len(test_loader) * total_batch_size print(test_samples) ## Main script for test phase MSE_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda() PSNR_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda() SSIM_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda() PIPS_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda() with torch.no_grad(): model.eval() for it, frames in enumerate(test_loader): frames = frames.permute(0, 1, 4, 2, 3).cuda() inputs = frames[:, :args.input_frames] origin = frames[:, -args.future_frames:] pred = model(inputs, input_frames = args.input_frames, future_frames = args.future_frames, output_frames = args.future_frames, teacher_forcing = False) # accumlate the statistics per frame for t in range(-args.future_frames, 0): origin_, pred_ = origin[:, t], pred[:, t] if args.img_channels == 1: origin_ = origin_.repeat([1, 3, 1, 1]) pred_ = pred_.repeat([1, 3, 1, 1]) dist = PSmodel(origin_, pred_) PIPS_[t] += torch.sum(dist).item() origin = origin.permute(0, 1, 3, 4, 2).cpu().numpy() pred = pred.permute(0, 1, 3, 4, 2).cpu().numpy() for t in 
range(-args.future_frames, 0): for i in range(batch_size): origin_, pred_ = origin[i, t], pred[i, t] if args.img_channels == 1: origin_ = np.squeeze(origin_, axis = -1) pred_ = np.squeeze(pred_, axis = -1) MSE_[t] += skimage.metrics.mean_squared_error(origin_, pred_) PSNR_[t] += skimage.metrics.peak_signal_noise_ratio(origin_, pred_) SSIM_[t] += skimage.metrics.structural_similarity(origin_, pred_, multichannel = args.img_channels > 1) if args.distributed: MSE = reduce_tensor( MSE_) / test_samples PSNR = reduce_tensor(PSNR_) / test_samples SSIM = reduce_tensor(SSIM_) / test_samples PIPS = reduce_tensor(PIPS_) / test_samples else: # if not args.distributed: MSE = MSE_ / test_samples PSNR = PSNR_ / test_samples SSIM = SSIM_ / test_samples PIPS = PIPS_ / test_samples if args.local_rank == 0: print("MSE: {} (x1e-3)\nPSNR: {}\nSSIM: {}\nLPIPS: {}".format( 1e3 * torch.mean(MSE).cpu().item(), torch.mean(PSNR).cpu().item(), torch.mean(SSIM).cpu().item(), torch.mean(PIPS).cpu().item())) print( "MSE:", MSE.cpu().numpy()) print("PSNR:", PSNR.cpu().numpy()) print("SSIM:", SSIM.cpu().numpy()) print("PIPS:", PIPS.cpu().numpy())
from nnunet.nn_unet import NNUnet from utils.args import get_main_args from utils.gpu_affinity import set_affinity from utils.logger import LoggingCallback from utils.utils import make_empty_dir, set_cuda_devices, verify_ckpt_path if __name__ == "__main__": args = get_main_args() if args.profile: nvidia_dlprof_pytorch_nvtx.init() print("Profiling enabled") if args.affinity != "disabled": affinity = set_affinity(int(os.getenv("LOCAL_RANK", "0")), args.gpus, mode=args.affinity) # Limit number of CPU threads os.environ["OMP_NUM_THREADS"] = "1" # Set device limit on the current device cudaLimitMaxL2FetchGranularity = 0x05 _libcudart = ctypes.CDLL("libcudart.so") pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int)) _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) assert pValue.contents.value == 128 set_cuda_devices(args) seed_everything(args.seed) data_module = DataModule(args) data_module.prepare_data()
def main(args): ## Distributed computing # utility for synchronization def reduce_tensor(tensor, reduce_sum=False): rt = tensor.clone() torch.distributed.all_reduce(rt, op=torch.distributed.ReduceOp.SUM) return rt if reduce_sum else (rt / world_size) # enable distributed computing if args.distributed: set_affinity(args.local_rank) num_devices = torch.cuda.device_count() torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') world_size = torch.distributed.get_world_size( ) #os.environ['WORLD_SIZE'] print('num_devices', num_devices, 'local_rank', args.local_rank, 'world_size', world_size) else: num_devices, world_size = 1, 1 ## Model preparation (Conv-LSTM or Conv-TT-LSTM) # construct the model with the specified hyper-parameters model = ConvLSTMNet( input_channels=args.img_channels, output_sigmoid=args.use_sigmoid, # model architecture layers_per_block=(3, 3, 3, 3), hidden_channels=(32, 48, 48, 32), skip_stride=2, # convolutional tensor-train layers cell=args.model, cell_params={ "order": args.model_order, "steps": args.model_steps, "ranks": args.model_ranks }, # convolutional parameters kernel_size=args.kernel_size).cuda() ## Dataset Preparation (KTH, MNIST) Dataset = {"KTH": KTH_Dataset, "MNIST": MNIST_Dataset}[args.dataset] DATA_DIR = os.path.join("../data", { "MNIST": "mnist", "KTH": "kth" }[args.dataset]) # batch size for each process total_batch_size = args.batch_size assert total_batch_size % world_size == 0, \ 'The batch_size is not divisible by world_size.' 
batch_size = total_batch_size // world_size # dataloader for the training dataset train_data_path = os.path.join(DATA_DIR, args.train_data_file) train_dataset = Dataset({ "path": train_data_path, "unique_mode": False, "num_frames": args.input_frames + args.future_frames, "num_samples": args.train_samples, "height": args.img_height, "width": args.img_width, "channels": args.img_channels, 'training': True }) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=world_size, rank=args.local_rank, shuffle=True) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, drop_last=True, num_workers=num_devices * 4, pin_memory=True, sampler=train_sampler) train_samples = len(train_loader) * total_batch_size # dataloaer for the valiation dataset valid_data_path = os.path.join(DATA_DIR, args.valid_data_file) valid_dataset = Dataset({ "path": valid_data_path, "unique_mode": True, "num_frames": args.input_frames + args.future_frames, "num_samples": args.valid_samples, "height": args.img_height, "width": args.img_width, "channels": args.img_channels, 'training': False }) valid_sampler = torch.utils.data.distributed.DistributedSampler( valid_dataset, num_replicas=world_size, rank=args.local_rank, shuffle=False) valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, drop_last=True, num_workers=num_devices * 4, pin_memory=True, sampler=valid_sampler) valid_samples = len(valid_loader) * total_batch_size # tensorboardX for logging learning curve if args.local_rank == 0: tensorboard = SummaryWriter() ## Main script for training and validation # loss function for training loss_func = lambda outputs, targets: \ F.l1_loss(outputs, targets) + F.mse_loss(outputs, targets) # intialize the scheduled sampling ratio scheduled_sampling_ratio = 1 ssr_decay_start = args.ssr_decay_start ssr_decay_mode = False # initialize the learning rate learning_rate = args.learning_rate lr_decay_start = args.num_epochs 
lr_decay_mode = False # best model in validation loss min_epoch, min_loss = 0, float("inf") ## Main script for training and validation if args.use_fused: from apex.optimizers import FusedAdam optimizer = FusedAdam(model.parameters(), lr=learning_rate) else: # if not args.use_fused: optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if args.use_amp: model, optimizer = amp.initialize(model, optimizer, opt_level="O1") if args.distributed: if args.use_apex: # use DDP from apex.parallel from apex.parallel import DistributedDataParallel as DDP model = DDP(model, delay_allreduce=True) else: # use DDP from nn.parallel from torch.nn.parallel import DistributedDataParallel as DDP model = DDP(model, device_ids=[args.local_rank]) for epoch in range(args.num_epochs): ## Phase 1: Learning on the training set model.train() samples = 0 for frames in train_loader: samples += total_batch_size frames = frames.permute(0, 1, 4, 2, 3).cuda() inputs = frames[:, :-1] origin = frames[:, -args.output_frames:] pred = model(inputs, input_frames=args.input_frames, future_frames=args.future_frames, output_frames=args.output_frames, teacher_forcing=True, scheduled_sampling_ratio=scheduled_sampling_ratio, checkpointing=args.use_checkpointing) loss = loss_func(pred, origin) if args.distributed: reduced_loss = reduce_tensor(loss.data) else: # if not args.distributed: reduced_loss = loss.data optimizer.zero_grad() if args.use_amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if args.gradient_clipping: grad_norm = nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.clipping_threshold) else: # if not args.use_amp: loss.backward() if args.gradient_clipping: grad_norm = nn.utils.clip_grad_norm_( model.parameters(), args.clipping_threshold) optimizer.step() if args.local_rank == 0: print('Epoch: {}/{}, Training: {}/{}, Loss: {}'.format( epoch + 1, args.num_epochs, samples, train_samples, reduced_loss.item())) ## Phase 2: Evaluation on the validation 
set model.eval() with torch.no_grad(): samples, LOSS = 0., 0. for it, frames in enumerate(valid_loader): samples += total_batch_size frames = frames.permute(0, 1, 4, 2, 3).cuda() inputs = frames[:, :args.input_frames] origin = frames[:, -args.output_frames:] pred = model(inputs, input_frames=args.input_frames, future_frames=args.future_frames, output_frames=args.output_frames, teacher_forcing=False, checkpointing=False) loss = loss_func(pred, origin) if args.distributed: reduced_loss = reduce_tensor(loss.data) else: # if not args.distributed: reduced_loss = loss.data LOSS += reduced_loss.item() * total_batch_size LOSS /= valid_samples if args.local_rank == 0: tensorboard.add_scalar("LOSS", LOSS, epoch + 1) if LOSS < min_loss: min_epoch, min_loss = epoch + 1, LOSS ## Phase 3: learning rate and scheduling sampling ratio adjustment if not ssr_decay_mode and epoch > ssr_decay_start \ and epoch > min_epoch + args.decay_log_epochs: ssr_decay_mode = True lr_decay_start = epoch + args.lr_decay_start if not lr_decay_mode and epoch > lr_decay_start \ and epoch > min_epoch + args.decay_log_epochs: lr_decay_mode = True if ssr_decay_mode and (epoch + 1) % args.ssr_decay_epoch == 0: scheduled_sampling_ratio = max( scheduled_sampling_ratio - args.ssr_decay_ratio, 0) if lr_decay_mode and (epoch + 1) % args.lr_decay_epoch == 0: for param_group in optimizer.param_groups: param_group['lr'] *= args.lr_decay_rate if args.local_rank == 0: torch.save(model.state_dict(), "checkpoint.pt")
def main(_): setup_xla_flags() tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if FLAGS.horovod: hvd.init() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "xnli": XnliProcessor, } if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.io.gfile.makedirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps hvd_rank = 0 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: tf.compat.v1.logging.info("Multi-GPU training with TF Horovod") tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size() master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() config.gpu_options.visible_device_list = str(hvd.local_rank()) set_affinity(hvd.local_rank()) if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 if FLAGS.amp: tf.enable_resource_variables() run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir if master_process else 
None, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, save_summary_steps=FLAGS.save_checkpoints_steps if master_process else None, log_step_count_steps=FLAGS.display_loss_steps, keep_checkpoint_max=1) if master_process: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format(key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") train_examples = None num_train_steps = None num_warmup_steps = None training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps, num_steps_ignore_xla=25)) if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / global_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) start_index = 0 end_index = len(train_examples) tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")] if FLAGS.horovod: tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())] num_examples_per_rank = len(train_examples) // hvd.size() remainder = len(train_examples) % hvd.size() if hvd.rank() < remainder: start_index = hvd.rank() * (num_examples_per_rank+1) end_index = start_index + num_examples_per_rank + 1 else: start_index = hvd.rank() * num_examples_per_rank + remainder end_index = start_index + (num_examples_per_rank) model_fn = model_fn_builder( task_name=task_name, bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) estimator = tf.estimator.Estimator( model_fn=model_fn, config=run_config) if FLAGS.do_train: 
file_based_convert_examples_to_features( train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank]) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Num examples = %d", len(train_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.compat.v1.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=tmp_filenames, batch_size=FLAGS.train_batch_size, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks) train_time_elapsed = time.time() - train_start_time train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = (training_hooks[-1].count - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead if master_process: tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, num_train_steps * global_batch_size) tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (training_hooks[-1].count - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and master_process: eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, 
tokenizer, eval_file) tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Num examples = %d", len(eval_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_drop_remainder = False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, batch_size=FLAGS.eval_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time time_list = eval_hooks[-1].time_list time_list.sort() # Removing outliers (init/warmup) in throughput computation. eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)]) num_sentences = (int(len(time_list) * 0.8)) * FLAGS.eval_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size) tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, num_sentences) tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 90 
(ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** Eval results *****") for key in sorted(result.keys()): dllogging.logger.log(step=(), data={key: float(result[key])}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict and master_process: predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.compat.v1.logging.info("***** Running prediction*****") tf.compat.v1.logging.info(" Num examples = %d", len(predict_examples)) tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, batch_size=FLAGS.predict_batch_size, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)] predict_start_time = time.time() output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") with tf.io.gfile.GFile(output_predict_file, "w") as writer: 
tf.compat.v1.logging.info("***** Predict results *****") for prediction in estimator.predict(input_fn=predict_input_fn, hooks=predict_hooks, yield_single_examples=False): output_line = "\t".join( str(class_probability) for class_probability in prediction) + "\n" writer.write(output_line) predict_time_elapsed = time.time() - predict_start_time time_list = predict_hooks[-1].time_list time_list.sort() # Removing outliers (init/warmup) in throughput computation. predict_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)]) num_sentences = (int(len(time_list) * 0.8)) * FLAGS.predict_batch_size avg = np.mean(time_list) cf_50 = max(time_list[:int(len(time_list) * 0.50)]) cf_90 = max(time_list[:int(len(time_list) * 0.90)]) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", predict_time_elapsed, predict_hooks[-1].count * FLAGS.predict_batch_size) tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", predict_time_wo_overhead, num_sentences) tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET") tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32") tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000) tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", 
cf_100 * 1000) tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------")
def main(_): setup_xla_flags() tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path) if not FLAGS.do_train and not FLAGS.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if FLAGS.horovod: import horovod.tensorflow as hvd hvd.init() bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) tf.io.gfile.makedirs(FLAGS.output_dir) input_files = [] for input_file_dir in FLAGS.input_files_dir.split(","): input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*"))) if FLAGS.horovod and len(input_files) < hvd.size(): raise ValueError("Input Files must be sharded") if FLAGS.amp and FLAGS.manual_fp16: raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error") is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 config = tf.compat.v1.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) set_affinity(hvd.local_rank()) if hvd.rank() == 0: tf.compat.v1.logging.info("***** Configuaration *****") for key in FLAGS.__flags.keys(): tf.compat.v1.logging.info(' {}: {}'.format(key, getattr(FLAGS, key))) tf.compat.v1.logging.info("**************************") # config.gpu_options.per_process_gpu_memory_fraction = 0.7 if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1 config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT if FLAGS.amp: tf.enable_resource_variables() run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, save_summary_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None, # This variable controls how often estimator reports examples/sec. # Default value is every 100 steps. 
# When --report_loss is True, we set to very large value to prevent # default info reporting from estimator. # Ideally we should set it to None, but that does not work. log_step_count_steps=10000 if FLAGS.report_loss else 100) model_fn = model_fn_builder( bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate*hvd.size(), num_train_steps=FLAGS.num_train_steps, num_warmup_steps=FLAGS.num_warmup_steps, use_one_hot_embeddings=False, hvd=None if not FLAGS.horovod else hvd) estimator = tf.estimator.Estimator( model_fn=model_fn, config=run_config) if FLAGS.do_train: training_hooks = [] if FLAGS.horovod and hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) if (not FLAGS.horovod or hvd.rank() == 0): global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size() training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps, FLAGS.save_checkpoints_steps, FLAGS.report_loss)) tf.compat.v1.logging.info("***** Running training *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size) train_input_fn = input_fn_builder( input_files=input_files, batch_size=FLAGS.train_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=True, hvd=None if not FLAGS.horovod else hvd) train_start_time = time.time() estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps) train_time_elapsed = time.time() - train_start_time if (not FLAGS.horovod or hvd.rank() == 0): train_time_wo_overhead = training_hooks[-1].total_time avg_sentences_per_second = FLAGS.num_train_steps * global_batch_size * 1.0 / train_time_elapsed ss_sentences_per_second = (FLAGS.num_train_steps - training_hooks[-1].skipped) * 
global_batch_size * 1.0 / train_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed, FLAGS.num_train_steps * global_batch_size) tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead, (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size) tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second) tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_train": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0): tf.compat.v1.logging.info("***** Running evaluation *****") tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_files = [] for eval_file_dir in FLAGS.eval_files_dir.split(","): eval_files.extend(tf.io.gfile.glob(os.path.join(eval_file_dir, "*"))) eval_input_fn = input_fn_builder( input_files=eval_files, batch_size=FLAGS.eval_batch_size, max_seq_length=FLAGS.max_seq_length, max_predictions_per_seq=FLAGS.max_predictions_per_seq, is_training=False, hvd=None if not FLAGS.horovod else hvd) eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)] eval_start_time = time.time() result = estimator.evaluate( input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks) eval_time_elapsed = time.time() - eval_start_time time_list = eval_hooks[-1].time_list time_list.sort() # Removing outliers (init/warmup) in throughput computation. 
eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)]) num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead tf.compat.v1.logging.info("-----------------------------") tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size) tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead, num_sentences) tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set") tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size) tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length) tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32") tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second) dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT) tf.compat.v1.logging.info("-----------------------------") output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: tf.compat.v1.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.compat.v1.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
arg( "--type", type=str, choices=["pre", "post"], help="Type of task to run; pre - localization, post - damage assesment", ) arg("--seed", type=int, default=1) parser = Model.add_model_specific_args(parser) args = parser.parse_args() if args.interpolate: args.deep_supervision = False args.dec_interp = False set_cuda_devices(args.gpus) affinity = set_affinity(os.getenv("LOCAL_RANK", "0"), "socket_unique_interleaved") seed_everything(args.seed) data_module = DataModule(args) callbacks = None checkpoint = args.ckpt if args.ckpt is not None and os.path.exists( args.ckpt) else None if args.exec_mode == "train": model = Model(args) model_ckpt = ModelCheckpoint(monitor="f1_score", mode="max", save_last=True) callbacks = [ EarlyStopping(monitor="f1_score", patience=args.patience, verbose=True,
def main():
    """Entry point: end-to-end object-detection training (COCO or Waymo layout).

    Pipeline: parse args -> seed RNGs -> initialize distributed execution
    (NCCL via environment variables) -> build model / optimizer / scheduler
    -> build train & validation loaders -> run the epoch loop with optional
    EMA evaluation and checkpointing -> emit a final dllogger summary.

    Relies on helpers defined elsewhere in this file/project
    (_parse_args, create_model, create_loader, train_epoch, validate, ...).
    """
    setup_default_logging()
    ## TODO(sugh) replace
    args, args_text = _parse_args()
    # Pin this process's threads to CPUs close to its GPU (NUMA affinity).
    set_affinity(args.local_rank)
    # Seed every RNG source used below for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    args.prefetcher = not args.no_prefetcher
    # Detect launcher-style distributed execution from the environment.
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        torch.cuda.manual_seed_all(args.seed)
        # One process per GPU; local_rank selects this process's device.
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()

    # Set device limit on the current device
    # cudaLimitMaxL2FetchGranularity = 0x05
    pValue = ctypes.cast((ctypes.c_int*1)(), ctypes.POINTER(ctypes.c_int))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    # Read the limit back to verify the driver accepted the 128-byte setting.
    assert pValue.contents.value == 128

    assert args.rank >= 0
    setup_dllogger(args.rank, filename=args.dllogger_file)

    if args.distributed:
        logging.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
                     % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on 1 GPU.')

    # Waymo mode requires train and val roots to be set (or omitted) together.
    if args.waymo:
        if (args.waymo_train is not None and args.waymo_val is None) or (args.waymo_train is None and args.waymo_val is not None):
            raise Exception("waymo_train or waymo_val is not set")

    memory_format = (
        torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format
    )

    model = create_model(
        args.model,
        input_size=args.input_size,
        num_classes=args.num_classes,
        bench_task='train',
        pretrained=args.pretrained,
        pretrained_backbone_path=args.pretrained_backbone_path,
        redundant_bias=args.redundant_bias,
        checkpoint_path=args.initial_checkpoint,
        label_smoothing=args.smoothing,
        fused_focal_loss=args.fused_focal_loss,
        remove_params=args.remove_weights,
        freeze_layers=args.freeze_layers,
        strict_load=False
    )
    # FIXME decide which args to keep and overlay on config / pass to backbone
    #     num_classes=args.num_classes,
    # The model config is the source of truth for the input resolution.
    input_size = model.config.image_size
    data_config = model.config
    print("Input size to be passed to dataloaders: {}".format(input_size))
    print("Image size used in model: {}".format(model.config.image_size))

    if args.rank == 0:
        dllogger.log(step='PARAMETER', data={'model_name':args.model, 'param_count': sum([m.numel() for m in model.parameters()])})

    model = model.cuda().to(memory_format=memory_format)

    # # optionally resume from a checkpoint

    if args.distributed:
        if args.sync_bn:
            try:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    logging.info(
                        'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                        'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')
            except Exception as e:
                logging.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')

    optimizer = create_optimizer(args, model)
    # GradScaler is a no-op when AMP is disabled.
    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    # Optionally restore model / optimizer / scaler state from the newest
    # checkpoint in the output directory.
    resume_state = {}
    resume_epoch = None
    output_base = args.output if args.output else './output'
    resume_checkpoint_path = get_latest_checkpoint(os.path.join(output_base, 'train'))
    if args.resume and resume_checkpoint_path is not None:
        print("Trying to load checkpoint from {}".format(resume_checkpoint_path))
        # unwrap_bench presumably strips the train-bench wrapper before
        # loading weights — confirm against its definition.
        resume_state, resume_epoch = resume_checkpoint(unwrap_bench(model), resume_checkpoint_path)
    if resume_epoch is not None:
        print("Resume training from {} epoch".format(resume_epoch))
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if args.amp and 'scaler' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring NVIDIA AMP state from checkpoint')
            scaler.load_state_dict(resume_state['scaler'])
    del resume_state  # release the (potentially large) state dict early

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        if args.resume and resume_checkpoint_path is not None:
            resume_path = resume_checkpoint_path
        else:
            resume_path = ''
        model_ema = ModelEma(
            model,
            decay=args.model_ema_decay,
            resume=resume_path)

    if args.distributed:
        if args.local_rank == 0:
            logging.info("Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.")
        model = DDP(model, device_ids=[args.device])  # can use device str in Torch >= 1.1
    # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epooch if False else resume_epoch  # NOTE(review): kept as plain assignment below
    if lr_scheduler is not None and start_epoch > 0:
        # Fast-forward the scheduler to the resumed epoch.
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        dllogger.log(step="PARAMETER", data={'Scheduled_epochs': num_epochs}, verbosity=0)

    # Benchmark will always override every other setting.
    if args.benchmark:
        start_epoch = 0
        num_epochs = args.epochs

    # Training data: Waymo paths come straight from args; COCO uses the
    # standard <data>/annotations/instances_train2017.json layout.
    if args.waymo:
        train_annotation_path = args.waymo_train_annotation
        train_image_dir = args.waymo_train
    else:
        train_anno_set = 'train2017'
        train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json')
        train_image_dir = train_anno_set
    dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path, data_config)

    loader_train = create_loader(
        dataset_train,
        input_size=input_size,
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        interpolation=args.train_interpolation,
        num_workers=args.workers,
        distributed=args.distributed,
        pin_mem=args.pin_mem,
        memory_format=memory_format
    )

    loader_train_iter = iter(loader_train)
    # Steps per epoch from the *global* batch size (world_size x per-GPU batch).
    steps_per_epoch = int(np.ceil( len(dataset_train) / (args.world_size * args.batch_size) ))

    if args.waymo:
        val_annotation_path = args.waymo_val_annotation
        val_image_dir = args.waymo_val
    else:
        val_anno_set = 'val2017'
        val_annotation_path = os.path.join(args.data, 'annotations', f'instances_{val_anno_set}.json')
        val_image_dir = val_anno_set
    dataset_eval = CocoDetection(os.path.join(args.data, val_image_dir), val_annotation_path, data_config)

    loader_eval = create_loader(
        dataset_eval,
        input_size=input_size,
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=args.interpolation,
        num_workers=args.workers,
        distributed=args.distributed,
        pin_mem=args.pin_mem,
        memory_format=memory_format
    )

    evaluator = COCOEvaluator(dataset_eval.coco, distributed=args.distributed, waymo=args.waymo)

    eval_metric = args.eval_metric
    eval_metrics = None
    train_metrics = {}
    best_metric = -1
    is_best = False
    best_epoch = None
    saver = None
    output_dir = ''
    # Only rank 0 creates the output dir, checkpoints, and args snapshot.
    if args.rank == 0:
        output_base = args.output if args.output else './output'
        output_dir = get_outdirectory(output_base, 'train')
        decreasing = True if eval_metric == 'loss' else False  # NOTE(review): computed but not used below
        saver = CheckpointSaver(checkpoint_dir=output_dir)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                # Re-shuffle shards each epoch for the distributed sampler.
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(
                epoch, steps_per_epoch, model, loader_train_iter, optimizer, args,
                lr_scheduler=lr_scheduler, output_dir=output_dir, use_amp=args.amp, scaler=scaler, model_ema=model_ema)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logging.info("Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            # the overhead of evaluating with coco style datasets is fairly high, so just ema or non, not both
            if model_ema is not None:
                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')

                if epoch >= args.eval_after:
                    eval_metrics = validate(model_ema.ema, loader_eval, args, evaluator, epoch, log_suffix=' (EMA)')
            else:
                eval_metrics = validate(model, loader_eval, args, evaluator, epoch)

            lr_scheduler.step(epoch + 1)

            if saver is not None and args.rank == 0 and epoch % args.save_checkpoint_interval == 0:
                if eval_metrics is not None:
                    # save proper checkpoint with eval metric
                    is_best = eval_metrics[eval_metric] > best_metric
                    best_metric = max(
                        eval_metrics[eval_metric],
                        best_metric
                    )
                    best_epoch = epoch
                else:
                    is_best = False
                    best_metric = 0
                saver.save_checkpoint(model, optimizer, epoch, model_ema=model_ema, metric=best_metric, is_best=is_best)

    except KeyboardInterrupt:
        # Allow Ctrl-C to exit cleanly: flush pending log lines, free GPU memory.
        dllogger.flush()
        torch.cuda.empty_cache()
    if best_metric > 0:
        train_metrics.update({'best_map': best_metric, 'best_epoch': best_epoch})
    if eval_metrics is not None:
        train_metrics.update(eval_metrics)
    dllogger.log(step=(), data=train_metrics, verbosity=0)
def main(_):
    """TF-Estimator entry point: fine-tune / evaluate / predict BERT for biomedical NER.

    Driven entirely by absl FLAGS.  Supports optional multi-GPU training via
    Horovod (FLAGS.horovod), XLA JIT (FLAGS.use_xla) and mixed precision
    (FLAGS.amp).  The three phases are gated by do_train / do_eval /
    do_predict; eval and predict run on the master rank only.  Relies on
    helpers defined elsewhere in this file (model_fn_builder,
    file_based_input_fn_builder, filed_based_convert_examples_to_features,
    result_to_pair, evaluate, report_notprint, ...).
    """
    setup_xla_flags()

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    # Dataset-name -> example-processor class (biomedical NER corpora).
    processors = {
        "bc5cdr": BC5CDRProcessor,
        "clefe": CLEFEProcessor,
        'i2b2': I2b22012Processor
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # BERT position embeddings cap the usable sequence length.
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2  # NOTE(review): not used below
    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        # Scale the global batch by the worker count; rank 0 is the master.
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        # Each process sees exactly one GPU, selected by its local rank.
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        set_affinity(hvd.local_rank())
        if hvd.size() > 1:
            # Sync initial variables from rank 0 to every worker.
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        if FLAGS.amp:
            tf.enable_resource_variables()
    # Only the master writes checkpoints; other ranks run checkpoint-less.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuaration *****")  # (sic) — runtime string left unchanged
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # The last hook is LogTrainRunHook; its total_time / skipped fields are
    # read back after training to compute overhead-free throughput.
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            # Shard training examples across ranks; the first `remainder`
            # ranks take one extra example so every example is covered.
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    # num_labels is len(label_list) + 1 — presumably reserving an extra id
    # (e.g. padding); TODO(review): confirm against model_fn_builder.
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd,
                                amp=FLAGS.amp)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        #train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        #filed_based_convert_examples_to_features(
        #    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        # Each rank converts only its own shard to TFRecord.
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info(" Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,  #train_file,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        #estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time

        # Throughput: wall-clock vs. hook-measured time that excludes the
        # hook's skipped warmup steps.
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            dllogging.logger.log(
                step=(),
                data={"throughput_train": ss_sentences_per_second},
                verbosity=Verbosity.DEFAULT)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(eval_examples,
                                                 label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info(" Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None  # evaluate until the input is exhausted
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Persist eval metrics to disk and mirror them into dllogger.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info(" %s = %s", key, str(result[key]))
                dllogging.logger.log(step=(), data={key: float(str(result[key]))}, verbosity=Verbosity.DEFAULT)
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 predict_file, mode="test")

        # Recover the label-id mapping written during feature conversion.
        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}

        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if tf.io.gfile.exists(token_path):
            tf.io.gfile.remove(token_path)

        tf.compat.v1.logging.info("***** Running prediction*****")
        tf.compat.v1.logging.info(" Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
        test_labels_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt")
        with tf.io.gfile.GFile(output_predict_file, 'w') as writer, \
                tf.io.gfile.GFile(test_labels_file, 'w') as tl, \
                tf.io.gfile.GFile(test_labels_err_file, 'w') as tle:
            print(id2label)
            i = 0
            for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks,
                                                yield_single_examples=True):
                # id 0 is filtered out — presumably the padding label;
                # TODO(review): confirm against the feature-conversion code.
                output_line = "\n".join(id2label[id] for id in prediction if id != 0) + "\n"
                writer.write(output_line)
                result_to_pair(predict_examples[i], prediction, id2label, tl, tle)
                i = i + 1

        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.predict_batch_size

        # Latency percentiles over per-batch times (list is sorted ascending).
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        # Score the written predictions with the conlleval-style evaluator
        # and persist the report.
        tf.compat.v1.logging.info('Reading: %s', test_labels_file)
        with tf.io.gfile.GFile(test_labels_file, "r") as f:
            counts = evaluate(f)
        eval_result = report_notprint(counts)
        print(''.join(eval_result))
        with tf.io.gfile.GFile(
                os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
            fd.write(''.join(eval_result))