Example #1
def generic_train(
        model: BaseTransformer,
        args: argparse.Namespace,
        logging_callback=None,
        checkpoint_callback=None,
        extra_callbacks=[],
        logger=True,  # can pass WandbLogger() here
        **extra_train_kwargs):
    pl.seed_everything(args.seed)

    # init model
    odir = Path(model.hparams.output_dir)
    odir.mkdir(exist_ok=True)

    # add custom checkpoints
    if checkpoint_callback is None:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            filepath=args.output_dir,
            prefix="checkpoint",
            monitor="val_loss",
            mode="min",
            save_top_k=1)
    if logging_callback is None:
        logging_callback = LoggingCallback()

    train_params = {}

    train_params["limit_val_batches"] = 2

    # TODO: remove with PyTorch 1.6 since pl uses native amp
    if args.fp16:
        train_params["precision"] = 16
        train_params["amp_level"] = args.amp_level

    if args.gpus > 1 or args.num_nodes > 1:
        train_params["distributed_backend"] = "ddp"
        train_params["accelerator"] = "ddp"

    trainer = pl.Trainer.from_argparse_args(
        args,
        weights_summary=None,
        callbacks=[logging_callback] + extra_callbacks,
        logger=logger,
        checkpoint_callback=checkpoint_callback,
        **train_params,
    )

    if args.affinity != 'disabled':
        affinity = set_affinity(get_rank(), get_device_count(), args.affinity)
        print(f'{get_rank()}: thread affinity: {affinity}')

    if args.do_train:
        trainer.fit(model)

    return trainer
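A minimal usage sketch for the helper above (hypothetical: `MyTransformer` and the argument defaults are assumptions, only `generic_train` itself comes from the example):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default="./outputs")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--amp_level", default="O2")
parser.add_argument("--gpus", type=int, default=1)
parser.add_argument("--num_nodes", type=int, default=1)
parser.add_argument("--affinity", default="disabled")
parser.add_argument("--do_train", action="store_true")
args = parser.parse_args()

model = MyTransformer(args)  # hypothetical BaseTransformer subclass whose hparams include output_dir
trainer = generic_train(model, args)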
Example #2
import os

import torch
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, early_stopping

from data_loading.data_module import DataModule
from models.nn_unet import NNUnet
from utils.gpu_affinity import set_affinity
from utils.logger import LoggingCallback
from utils.utils import get_main_args, is_main_process, log, make_empty_dir, set_cuda_devices, verify_ckpt_path

if __name__ == "__main__":
    args = get_main_args()

    if args.affinity != "disabled":
        affinity = set_affinity(os.getenv("LOCAL_RANK", "0"), args.affinity)

    set_cuda_devices(args)
    seed_everything(args.seed)
    data_module = DataModule(args)
    data_module.prepare_data()
    data_module.setup()
    ckpt_path = verify_ckpt_path(args)

    callbacks = None
    model_ckpt = None
    if args.benchmark:
        model = NNUnet(args)
        batch_size = args.batch_size if args.exec_mode == "train" else args.val_batch_size
        log_dir = os.path.join(args.results, args.logname if args.logname is not None else "perf.json")
        callbacks = [
Example #3
def main(args):
    ## Distributed computing

    # utility for synchronization
    def reduce_tensor(tensor):
        rt = tensor.clone()
        torch.distributed.all_reduce(rt, op = torch.distributed.ReduceOp.SUM)
        return rt

    # enable distributed computing
    if args.distributed:
        set_affinity(args.local_rank)
        num_devices = torch.cuda.device_count()
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend = 'nccl', init_method = 'env://')

        world_size  = torch.distributed.get_world_size() #os.environ['WORLD_SIZE']
        print('num_devices', num_devices, 
              'local_rank', args.local_rank, 
              'world_size', world_size)
    else: # if not args.distributed:
        num_devices, world_size = 1, 1

    ## Model preparation (Conv-LSTM or Conv-TT-LSTM)

    # construct the model with the specified hyper-parameters
    model = ConvLSTMNet(
        input_channels = args.img_channels, 
        output_sigmoid = args.use_sigmoid,
        # model architecture
        layers_per_block = (3, 3, 3, 3), 
        hidden_channels  = (32, 48, 48, 32), 
        skip_stride = 2,
        # convolutional tensor-train layers
        cell = args.model,
        cell_params = {
            "order": args.model_order, 
            "steps": args.model_steps, 
            "ranks": args.model_ranks},
        # convolutional parameters
        kernel_size = args.kernel_size).cuda()

    if args.distributed:
        if args.use_apex: # use DDP from apex.parallel
            from apex.parallel import DistributedDataParallel as DDP
            model = DDP(model, delay_allreduce = True)
        else: # use DDP from nn.parallel
            from torch.nn.parallel import DistributedDataParallel as DDP
            model = DDP(model, device_ids = [args.local_rank])

    PSmodel = PSmodels.PerceptualLoss(model = 'net-lin', 
        net = 'alex', use_gpu = True, gpu_ids = [args.local_rank])

    ## Dataset Preparation (KTH, UCF, tinyUCF)
    Dataset = {"KTH": KTH_Dataset, "MNIST": MNIST_Dataset}[args.dataset]

    DATA_DIR = os.path.join("../data", 
        {"MNIST": "mnist", "KTH": "kth"}[args.dataset])

    # batch size for each process
    total_batch_size  = args.batch_size
    assert total_batch_size % world_size == 0, \
        'The batch_size is not divisible by world_size.'
    batch_size = total_batch_size // world_size

    total_frames = args.input_frames + args.future_frames

    # dataloader for the validation dataset
    test_data_path = os.path.join(DATA_DIR, args.test_data_file)
    assert os.path.exists(test_data_path), \
        "The test dataset does not exist. "+test_data_path

    test_dataset = Dataset({"path": test_data_path, 
        "unique_mode": True, "num_frames": total_frames, "num_samples": args.test_samples,
        "height": args.img_height, "width": args.img_width, "channels": args.img_channels, 'training': False})

    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas = world_size, rank = args.local_rank, shuffle = False)
    test_loader  = torch.utils.data.DataLoader(
        test_dataset, batch_size = batch_size, drop_last = True, 
        num_workers = num_devices * 4, pin_memory = True, sampler = test_sampler)

    test_samples = len(test_loader) * total_batch_size
    print(test_samples)

    ## Main script for test phase 
    MSE_  = torch.zeros((args.future_frames), dtype = torch.float32).cuda()
    PSNR_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda()
    SSIM_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda()
    PIPS_ = torch.zeros((args.future_frames), dtype = torch.float32).cuda()

    with torch.no_grad():
        model.eval()

        for it, frames in enumerate(test_loader):

            frames = frames.permute(0, 1, 4, 2, 3).cuda()
            inputs = frames[:,  :args.input_frames]
            origin = frames[:, -args.future_frames:]

            pred = model(inputs, 
                input_frames  =  args.input_frames, 
                future_frames = args.future_frames, 
                output_frames = args.future_frames, 
                teacher_forcing = False)

            # accumulate the statistics per frame
            for t in range(-args.future_frames, 0):
                origin_, pred_ = origin[:, t], pred[:, t]

                if args.img_channels == 1:
                    origin_ = origin_.repeat([1, 3, 1, 1])
                    pred_   =   pred_.repeat([1, 3, 1, 1])

                dist = PSmodel(origin_, pred_)
                PIPS_[t] += torch.sum(dist).item()

            origin = origin.permute(0, 1, 3, 4, 2).cpu().numpy()
            pred   =   pred.permute(0, 1, 3, 4, 2).cpu().numpy()

            for t in range(-args.future_frames, 0):
                for i in range(batch_size):
                    origin_, pred_ = origin[i, t], pred[i, t]

                    if args.img_channels == 1:
                        origin_ = np.squeeze(origin_, axis = -1)
                        pred_   = np.squeeze(pred_,   axis = -1)

                    MSE_[t]  += skimage.metrics.mean_squared_error(origin_, pred_)
                    PSNR_[t] += skimage.metrics.peak_signal_noise_ratio(origin_, pred_)
                    SSIM_[t] += skimage.metrics.structural_similarity(origin_, pred_, multichannel = args.img_channels > 1)

        if args.distributed:
            MSE  = reduce_tensor( MSE_) / test_samples
            PSNR = reduce_tensor(PSNR_) / test_samples
            SSIM = reduce_tensor(SSIM_) / test_samples
            PIPS = reduce_tensor(PIPS_) / test_samples
        else: # if not args.distributed:
            MSE  = MSE_  / test_samples
            PSNR = PSNR_ / test_samples
            SSIM = SSIM_ / test_samples
            PIPS = PIPS_ / test_samples

    if args.local_rank == 0:
        print("MSE: {} (x1e-3)\nPSNR: {}\nSSIM: {}\nLPIPS: {}".format(
            1e3 * torch.mean(MSE).cpu().item(), torch.mean(PSNR).cpu().item(), 
            torch.mean(SSIM).cpu().item(), torch.mean(PIPS).cpu().item()))

    print( "MSE:",  MSE.cpu().numpy())
    print("PSNR:", PSNR.cpu().numpy())
    print("SSIM:", SSIM.cpu().numpy())
    print("PIPS:", PIPS.cpu().numpy())
Example #4
import ctypes
import os

import nvidia_dlprof_pytorch_nvtx
from pytorch_lightning import seed_everything

from data_loading.data_module import DataModule
from nnunet.nn_unet import NNUnet
from utils.args import get_main_args
from utils.gpu_affinity import set_affinity
from utils.logger import LoggingCallback
from utils.utils import make_empty_dir, set_cuda_devices, verify_ckpt_path

if __name__ == "__main__":
    args = get_main_args()

    if args.profile:
        nvidia_dlprof_pytorch_nvtx.init()
        print("Profiling enabled")

    if args.affinity != "disabled":
        affinity = set_affinity(int(os.getenv("LOCAL_RANK", "0")),
                                args.gpus,
                                mode=args.affinity)

    # Limit number of CPU threads
    os.environ["OMP_NUM_THREADS"] = "1"
    # Set device limit on the current device cudaLimitMaxL2FetchGranularity = 0x05
    _libcudart = ctypes.CDLL("libcudart.so")
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
    _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
    assert pValue.contents.value == 128

    set_cuda_devices(args)
    seed_everything(args.seed)
    data_module = DataModule(args)
    data_module.prepare_data()
Example #5
def main(args):
    ## Distributed computing

    # utility for synchronization
    def reduce_tensor(tensor, reduce_sum=False):
        rt = tensor.clone()
        torch.distributed.all_reduce(rt, op=torch.distributed.ReduceOp.SUM)
        return rt if reduce_sum else (rt / world_size)

    # enable distributed computing
    if args.distributed:
        set_affinity(args.local_rank)
        num_devices = torch.cuda.device_count()
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

        world_size = torch.distributed.get_world_size()  # os.environ['WORLD_SIZE']
        print('num_devices', num_devices, 'local_rank', args.local_rank,
              'world_size', world_size)
    else:
        num_devices, world_size = 1, 1

    ## Model preparation (Conv-LSTM or Conv-TT-LSTM)

    # construct the model with the specified hyper-parameters
    model = ConvLSTMNet(
        input_channels=args.img_channels,
        output_sigmoid=args.use_sigmoid,
        # model architecture
        layers_per_block=(3, 3, 3, 3),
        hidden_channels=(32, 48, 48, 32),
        skip_stride=2,
        # convolutional tensor-train layers
        cell=args.model,
        cell_params={
            "order": args.model_order,
            "steps": args.model_steps,
            "ranks": args.model_ranks
        },
        # convolutional parameters
        kernel_size=args.kernel_size).cuda()

    ## Dataset Preparation (KTH, MNIST)
    Dataset = {"KTH": KTH_Dataset, "MNIST": MNIST_Dataset}[args.dataset]

    DATA_DIR = os.path.join("../data", {
        "MNIST": "mnist",
        "KTH": "kth"
    }[args.dataset])

    # batch size for each process
    total_batch_size = args.batch_size
    assert total_batch_size % world_size == 0, \
        'The batch_size is not divisible by world_size.'
    batch_size = total_batch_size // world_size

    # dataloader for the training dataset
    train_data_path = os.path.join(DATA_DIR, args.train_data_file)

    train_dataset = Dataset({
        "path": train_data_path,
        "unique_mode": False,
        "num_frames": args.input_frames + args.future_frames,
        "num_samples": args.train_samples,
        "height": args.img_height,
        "width": args.img_width,
        "channels": args.img_channels,
        'training': True
    })

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=world_size,
        rank=args.local_rank,
        shuffle=True)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               drop_last=True,
                                               num_workers=num_devices * 4,
                                               pin_memory=True,
                                               sampler=train_sampler)

    train_samples = len(train_loader) * total_batch_size

    # dataloader for the validation dataset
    valid_data_path = os.path.join(DATA_DIR, args.valid_data_file)

    valid_dataset = Dataset({
        "path": valid_data_path,
        "unique_mode": True,
        "num_frames": args.input_frames + args.future_frames,
        "num_samples": args.valid_samples,
        "height": args.img_height,
        "width": args.img_width,
        "channels": args.img_channels,
        'training': False
    })

    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=world_size,
        rank=args.local_rank,
        shuffle=False)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=batch_size,
                                               drop_last=True,
                                               num_workers=num_devices * 4,
                                               pin_memory=True,
                                               sampler=valid_sampler)

    valid_samples = len(valid_loader) * total_batch_size

    # tensorboardX for logging learning curve
    if args.local_rank == 0:
        tensorboard = SummaryWriter()

    ## Main script for training and validation

    # loss function for training
    loss_func = lambda outputs, targets: \
        F.l1_loss(outputs, targets) + F.mse_loss(outputs, targets)

    # initialize the scheduled sampling ratio
    scheduled_sampling_ratio = 1
    ssr_decay_start = args.ssr_decay_start
    ssr_decay_mode = False

    # initialize the learning rate
    learning_rate = args.learning_rate
    lr_decay_start = args.num_epochs
    lr_decay_mode = False

    # best model in validation loss
    min_epoch, min_loss = 0, float("inf")

    ## Main script for training and validation
    if args.use_fused:
        from apex.optimizers import FusedAdam
        optimizer = FusedAdam(model.parameters(), lr=learning_rate)
    else:  # if not args.use_fused:
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if args.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if args.distributed:
        if args.use_apex:  # use DDP from apex.parallel
            from apex.parallel import DistributedDataParallel as DDP
            model = DDP(model, delay_allreduce=True)
        else:  # use DDP from nn.parallel
            from torch.nn.parallel import DistributedDataParallel as DDP
            model = DDP(model, device_ids=[args.local_rank])

    for epoch in range(args.num_epochs):

        ## Phase 1: Learning on the training set
        model.train()

        samples = 0
        for frames in train_loader:
            samples += total_batch_size

            frames = frames.permute(0, 1, 4, 2, 3).cuda()
            inputs = frames[:, :-1]
            origin = frames[:, -args.output_frames:]

            pred = model(inputs,
                         input_frames=args.input_frames,
                         future_frames=args.future_frames,
                         output_frames=args.output_frames,
                         teacher_forcing=True,
                         scheduled_sampling_ratio=scheduled_sampling_ratio,
                         checkpointing=args.use_checkpointing)

            loss = loss_func(pred, origin)

            if args.distributed:
                reduced_loss = reduce_tensor(loss.data)
            else:  # if not args.distributed:
                reduced_loss = loss.data

            optimizer.zero_grad()

            if args.use_amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if args.gradient_clipping:
                    grad_norm = nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.clipping_threshold)
            else:  # if not args.use_amp:
                loss.backward()
                if args.gradient_clipping:
                    grad_norm = nn.utils.clip_grad_norm_(
                        model.parameters(), args.clipping_threshold)

            optimizer.step()

            if args.local_rank == 0:
                print('Epoch: {}/{}, Training: {}/{}, Loss: {}'.format(
                    epoch + 1, args.num_epochs, samples, train_samples,
                    reduced_loss.item()))

        ## Phase 2: Evaluation on the validation set
        model.eval()

        with torch.no_grad():
            samples, LOSS = 0., 0.
            for it, frames in enumerate(valid_loader):
                samples += total_batch_size

                frames = frames.permute(0, 1, 4, 2, 3).cuda()
                inputs = frames[:, :args.input_frames]
                origin = frames[:, -args.output_frames:]

                pred = model(inputs,
                             input_frames=args.input_frames,
                             future_frames=args.future_frames,
                             output_frames=args.output_frames,
                             teacher_forcing=False,
                             checkpointing=False)

                loss = loss_func(pred, origin)

                if args.distributed:
                    reduced_loss = reduce_tensor(loss.data)
                else:  # if not args.distributed:
                    reduced_loss = loss.data

                LOSS += reduced_loss.item() * total_batch_size

            LOSS /= valid_samples

            if args.local_rank == 0:
                tensorboard.add_scalar("LOSS", LOSS, epoch + 1)

            if LOSS < min_loss:
                min_epoch, min_loss = epoch + 1, LOSS

        ## Phase 3: learning rate and scheduling sampling ratio adjustment
        if not ssr_decay_mode and epoch > ssr_decay_start \
            and epoch > min_epoch + args.decay_log_epochs:
            ssr_decay_mode = True
            lr_decay_start = epoch + args.lr_decay_start

        if not  lr_decay_mode and epoch >  lr_decay_start \
            and epoch > min_epoch + args.decay_log_epochs:
            lr_decay_mode = True

        if ssr_decay_mode and (epoch + 1) % args.ssr_decay_epoch == 0:
            scheduled_sampling_ratio = max(
                scheduled_sampling_ratio - args.ssr_decay_ratio, 0)

        if lr_decay_mode and (epoch + 1) % args.lr_decay_epoch == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.lr_decay_rate

    if args.local_rank == 0:
        torch.save(model.state_dict(), "checkpoint.pt")
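A follow-up sketch for reloading the checkpoint saved above for evaluation; it assumes the same constructor arguments as in the example, and the `module.` prefix it strips is the one added by the DistributedDataParallel wrapper:

state_dict = torch.load("checkpoint.pt", map_location="cuda")
# drop the "module." prefix added by DistributedDataParallel, if present
state_dict = {k[len("module."):] if k.startswith("module.") else k: v
              for k, v in state_dict.items()}
model = ConvLSTMNet(
    input_channels=args.img_channels, output_sigmoid=args.use_sigmoid,
    layers_per_block=(3, 3, 3, 3), hidden_channels=(32, 48, 48, 32), skip_stride=2,
    cell=args.model, cell_params={"order": args.model_order, "steps": args.model_steps,
                                  "ranks": args.model_ranks},
    kernel_size=args.kernel_size).cuda()
model.load_state_dict(state_dict)
model.eval()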
Example #6
def main(_):

  setup_xla_flags()

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

  if FLAGS.horovod:
    hvd.init()

  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
  }

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.io.gfile.makedirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  master_process = True
  training_hooks = []
  global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
  hvd_rank = 0

  config = tf.compat.v1.ConfigProto()
  if FLAGS.horovod:

      tf.compat.v1.logging.info("Multi-GPU training with TF Horovod")
      tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
      master_process = (hvd.rank() == 0)
      hvd_rank = hvd.rank()
      config.gpu_options.visible_device_list = str(hvd.local_rank())
      set_affinity(hvd.local_rank())
      if hvd.size() > 1:
          training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
  if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    if FLAGS.amp:
      tf.enable_resource_variables()

  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir if master_process else None,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
      save_summary_steps=FLAGS.save_checkpoints_steps if master_process else None,
      log_step_count_steps=FLAGS.display_loss_steps,
      keep_checkpoint_max=1)

  if master_process:
      tf.compat.v1.logging.info("***** Configuaration *****")
      for key in FLAGS.__flags.keys():
          tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
      tf.compat.v1.logging.info("**************************")

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank, FLAGS.save_checkpoints_steps, num_steps_ignore_xla=25))

  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    start_index = 0
    end_index = len(train_examples)
    tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

    if FLAGS.horovod:
      tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
      num_examples_per_rank = len(train_examples) // hvd.size()
      remainder = len(train_examples) % hvd.size()
      if hvd.rank() < remainder:
        start_index = hvd.rank() * (num_examples_per_rank+1)
        end_index = start_index + num_examples_per_rank + 1
      else:
        start_index = hvd.rank() * num_examples_per_rank + remainder
        end_index = start_index + (num_examples_per_rank)

  model_fn = model_fn_builder(
      task_name=task_name,
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:

    file_based_convert_examples_to_features(
        train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

    tf.compat.v1.logging.info("***** Running training *****")
    tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=tmp_filenames,
        batch_size=FLAGS.train_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        hvd=None if not FLAGS.horovod else hvd)

    train_start_time = time.time()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
    train_time_elapsed = time.time() - train_start_time
    train_time_wo_overhead = training_hooks[-1].total_time
    avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
    ss_sentences_per_second = (training_hooks[-1].count - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

    if master_process:
        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                        num_train_steps * global_batch_size)
        tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                        (training_hooks[-1].count - training_hooks[-1].skipped) * global_batch_size)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        tf.compat.v1.logging.info("-----------------------------")

  if FLAGS.do_eval and master_process:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

    tf.compat.v1.logging.info("***** Running evaluation *****")
    tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    eval_drop_remainder = False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        batch_size=FLAGS.eval_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
    eval_start_time = time.time()
    result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)

    eval_time_elapsed = time.time() - eval_start_time

    time_list = eval_hooks[-1].time_list
    time_list.sort()
    # Removing outliers (init/warmup) in throughput computation.
    eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)])
    num_sentences = (int(len(time_list) * 0.8)) * FLAGS.eval_batch_size

    avg = np.mean(time_list)
    cf_50 = max(time_list[:int(len(time_list) * 0.50)])
    cf_90 = max(time_list[:int(len(time_list) * 0.90)])
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

    tf.compat.v1.logging.info("-----------------------------")
    tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                    eval_hooks[-1].count * FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                    num_sentences)
    tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
    tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
    tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
    tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
    tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
    tf.compat.v1.logging.info("-----------------------------")


    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.io.gfile.GFile(output_eval_file, "w") as writer:
      tf.compat.v1.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        dllogging.logger.log(step=(), data={key: float(result[key])}, verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict and master_process:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.compat.v1.logging.info("***** Running prediction*****")
    tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        batch_size=FLAGS.predict_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
    predict_start_time = time.time()

    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    with tf.io.gfile.GFile(output_predict_file, "w") as writer:
        tf.compat.v1.logging.info("***** Predict results *****")
        for prediction in estimator.predict(input_fn=predict_input_fn, hooks=predict_hooks,
                                            yield_single_examples=False):
            output_line = "\t".join(
                str(class_probability) for class_probability in prediction) + "\n"
            writer.write(output_line)


    predict_time_elapsed = time.time() - predict_start_time

    time_list = predict_hooks[-1].time_list
    time_list.sort()
    # Removing outliers (init/warmup) in throughput computation.
    predict_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.8)])
    num_sentences = (int(len(time_list) * 0.8)) * FLAGS.predict_batch_size

    avg = np.mean(time_list)
    cf_50 = max(time_list[:int(len(time_list) * 0.50)])
    cf_90 = max(time_list[:int(len(time_list) * 0.90)])
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead

    tf.compat.v1.logging.info("-----------------------------")
    tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", predict_time_elapsed,
                    predict_hooks[-1].count * FLAGS.predict_batch_size)
    tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", predict_time_wo_overhead,
                              num_sentences)
    tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
    tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
    tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
    tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
    tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
    tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
    tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
    tf.compat.v1.logging.info("-----------------------------")
Example #7
def main(_):
  setup_xla_flags()

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  if FLAGS.horovod:
    import horovod.tensorflow as hvd
    hvd.init()

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.io.gfile.makedirs(FLAGS.output_dir)

  input_files = []
  for input_file_dir in FLAGS.input_files_dir.split(","):
    input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*")))

  if FLAGS.horovod and len(input_files) < hvd.size():
      raise ValueError("Input Files must be sharded")
  if FLAGS.amp and FLAGS.manual_fp16:
      raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  config = tf.compat.v1.ConfigProto()
  if FLAGS.horovod:
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    set_affinity(hvd.local_rank())
    if hvd.rank() == 0:
      tf.compat.v1.logging.info("***** Configuaration *****")
      for key in FLAGS.__flags.keys():
          tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
      tf.compat.v1.logging.info("**************************")

#    config.gpu_options.per_process_gpu_memory_fraction = 0.7
  if FLAGS.use_xla: 
      config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
      config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
      if FLAGS.amp:
        tf.enable_resource_variables()

  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
      save_summary_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
      # This variable controls how often estimator reports examples/sec.
      # Default value is every 100 steps.
      # When --report_loss is True, we set to very large value to prevent
      # default info reporting from estimator.
      # Ideally we should set it to None, but that does not work.
      log_step_count_steps=10000 if FLAGS.report_loss else 100)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate*hvd.size(),
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:

    training_hooks = []
    if FLAGS.horovod and hvd.size() > 1:
      training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if (not FLAGS.horovod or hvd.rank() == 0):
      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
      training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging, FLAGS.display_loss_steps, FLAGS.save_checkpoints_steps, FLAGS.report_loss))

    tf.compat.v1.logging.info("***** Running training *****")
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        batch_size=FLAGS.train_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True,
        hvd=None if not FLAGS.horovod else hvd)

    train_start_time = time.time()
    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
    train_time_elapsed = time.time() - train_start_time

    if (not FLAGS.horovod or hvd.rank() == 0):
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = FLAGS.num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                        FLAGS.num_train_steps * global_batch_size)
        tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                        (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        dllogging.logger.log(step=(), data={"throughput_train": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

  if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
    tf.compat.v1.logging.info("***** Running evaluation *****")
    tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    eval_files = []
    for eval_file_dir in FLAGS.eval_files_dir.split(","):
        eval_files.extend(tf.io.gfile.glob(os.path.join(eval_file_dir, "*")))

    eval_input_fn = input_fn_builder(
        input_files=eval_files,
        batch_size=FLAGS.eval_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False,
        hvd=None if not FLAGS.horovod else hvd)

    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
    eval_start_time = time.time()
    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)

    eval_time_elapsed = time.time() - eval_start_time
    time_list = eval_hooks[-1].time_list
    time_list.sort()
    # Removing outliers (init/warmup) in throughput computation.
    eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
    num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size

    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

    tf.compat.v1.logging.info("-----------------------------")
    tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                    eval_hooks[-1].count * FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                    num_sentences)
    tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
    tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
    tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
    tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second}, verbosity=Verbosity.DEFAULT)
    tf.compat.v1.logging.info("-----------------------------")

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.io.gfile.GFile(output_eval_file, "w") as writer:
      tf.compat.v1.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
Example #8
    arg(
        "--type",
        type=str,
        choices=["pre", "post"],
        help="Type of task to run; pre - localization, post - damage assesment",
    )
    arg("--seed", type=int, default=1)

    parser = Model.add_model_specific_args(parser)
    args = parser.parse_args()
    if args.interpolate:
        args.deep_supervision = False
        args.dec_interp = False

    set_cuda_devices(args.gpus)
    affinity = set_affinity(os.getenv("LOCAL_RANK", "0"),
                            "socket_unique_interleaved")
    seed_everything(args.seed)
    data_module = DataModule(args)

    callbacks = None
    checkpoint = args.ckpt if args.ckpt is not None and os.path.exists(
        args.ckpt) else None
    if args.exec_mode == "train":
        model = Model(args)
        model_ckpt = ModelCheckpoint(monitor="f1_score",
                                     mode="max",
                                     save_last=True)
        callbacks = [
            EarlyStopping(monitor="f1_score",
                          patience=args.patience,
                          verbose=True,
Example #9
def main():
    setup_default_logging()  ## TODO(sugh) replace
    args, args_text = _parse_args()
    set_affinity(args.local_rank)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        torch.cuda.manual_seed_all(args.seed)
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()

        # Set device limit on the current device
        # cudaLimitMaxL2FetchGranularity = 0x05
        pValue = ctypes.cast((ctypes.c_int*1)(), ctypes.POINTER(ctypes.c_int))
        _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
        _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
        assert pValue.contents.value == 128
    assert args.rank >= 0

    setup_dllogger(args.rank, filename=args.dllogger_file)

    if args.distributed:
        logging.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
                     % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on 1 GPU.')

    if args.waymo:
        if (args.waymo_train is not None and args.waymo_val is None) or (args.waymo_train is None and args.waymo_val is not None):
            raise Exception("waymo_train or waymo_val is not set")

    memory_format = (
        torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format
    )

    model = create_model(
        args.model,
        input_size=args.input_size,
        num_classes=args.num_classes,
        bench_task='train',
        pretrained=args.pretrained,
        pretrained_backbone_path=args.pretrained_backbone_path,
        redundant_bias=args.redundant_bias,
        checkpoint_path=args.initial_checkpoint,
        label_smoothing=args.smoothing,
        fused_focal_loss=args.fused_focal_loss,
        remove_params=args.remove_weights,
        freeze_layers=args.freeze_layers,
        strict_load=False
    )
    # FIXME decide which args to keep and overlay on config / pass to backbone
    #     num_classes=args.num_classes,
    input_size = model.config.image_size
    data_config = model.config
    print("Input size to be passed to dataloaders: {}".format(input_size))
    print("Image size used in model: {}".format(model.config.image_size))

    if args.rank == 0:
        dllogger.log(step='PARAMETER', data={'model_name':args.model, 'param_count': sum([m.numel() for m in model.parameters()])})
    model = model.cuda().to(memory_format=memory_format)

    # # optionally resume from a checkpoint

    if args.distributed:
        if args.sync_bn:
            try:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    logging.info(
                        'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                        'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')
            except Exception as e:
                logging.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
    optimizer = create_optimizer(args, model)
    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    resume_state = {}
    resume_epoch = None
    output_base = args.output if args.output else './output'
    resume_checkpoint_path = get_latest_checkpoint(os.path.join(output_base, 'train'))
    if args.resume and resume_checkpoint_path is not None:
        print("Trying to load checkpoint from {}".format(resume_checkpoint_path))
        resume_state, resume_epoch = resume_checkpoint(unwrap_bench(model), resume_checkpoint_path)
        if resume_epoch is not None:
            print("Resume training from {} epoch".format(resume_epoch))
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if args.amp and 'scaler' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring NVIDIA AMP state from checkpoint')
            scaler.load_state_dict(resume_state['scaler'])
    del resume_state

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        if args.resume and resume_checkpoint_path is not None:
            resume_path = resume_checkpoint_path
        else:
            resume_path = ''
        model_ema = ModelEma(
            model,
            decay=args.model_ema_decay,
            resume=resume_path)

    if args.distributed:
        if args.local_rank == 0:
            logging.info("Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.")
        model = DDP(model, device_ids=[args.device])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        dllogger.log(step="PARAMETER", data={'Scheduled_epochs': num_epochs}, verbosity=0)

    # Benchmark will always override every other setting.
    if args.benchmark:
        start_epoch = 0
        num_epochs = args.epochs

    if args.waymo:
        train_annotation_path = args.waymo_train_annotation
        train_image_dir = args.waymo_train
    else:
        train_anno_set = 'train2017'
        train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json')
        train_image_dir = train_anno_set
    dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path, data_config)

    loader_train = create_loader(
        dataset_train,
        input_size=input_size,
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        interpolation=args.train_interpolation,
        num_workers=args.workers,
        distributed=args.distributed,
        pin_mem=args.pin_mem,
        memory_format=memory_format
    )

    loader_train_iter = iter(loader_train)
    steps_per_epoch = int(np.ceil( len(dataset_train) / (args.world_size * args.batch_size) ))

    if args.waymo:
        val_annotation_path = args.waymo_val_annotation
        val_image_dir = args.waymo_val
    else:
        val_anno_set = 'val2017'
        val_annotation_path = os.path.join(args.data, 'annotations', f'instances_{val_anno_set}.json')
        val_image_dir = val_anno_set
    dataset_eval = CocoDetection(os.path.join(args.data, val_image_dir), val_annotation_path, data_config)

    loader_eval = create_loader(
        dataset_eval,
        input_size=input_size,
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=args.interpolation,
        num_workers=args.workers,
        distributed=args.distributed,
        pin_mem=args.pin_mem,
        memory_format=memory_format
    )

    evaluator = COCOEvaluator(dataset_eval.coco, distributed=args.distributed, waymo=args.waymo)

    eval_metric = args.eval_metric
    eval_metrics = None
    train_metrics = {}
    best_metric = -1
    is_best = False
    best_epoch = None
    saver = None
    output_dir = ''
    if args.rank == 0:
        output_base = args.output if args.output else './output'
        output_dir = get_outdirectory(output_base, 'train')
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(
                epoch, steps_per_epoch, model, loader_train_iter, optimizer, args,
                lr_scheduler=lr_scheduler, output_dir=output_dir, use_amp=args.amp, scaler=scaler, model_ema=model_ema)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logging.info("Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            # the overhead of evaluating with coco style datasets is fairly high, so just ema or non, not both
            if model_ema is not None:
                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
                
                if epoch >= args.eval_after:
                    eval_metrics = validate(model_ema.ema, loader_eval, args, evaluator, epoch, log_suffix=' (EMA)')
            else:
                eval_metrics = validate(model, loader_eval, args, evaluator, epoch)

            lr_scheduler.step(epoch + 1)

            if saver is not None and args.rank == 0 and epoch % args.save_checkpoint_interval == 0:
                if eval_metrics is not None:
                    # save proper checkpoint with eval metric
                    is_best = eval_metrics[eval_metric] > best_metric
                    best_metric = max(
                        eval_metrics[eval_metric],
                        best_metric
                    )
                    best_epoch = epoch
                else:
                    is_best = False
                    best_metric = 0
                saver.save_checkpoint(model, optimizer, epoch, model_ema=model_ema, metric=best_metric, is_best=is_best)


    except KeyboardInterrupt:
        dllogger.flush()
        torch.cuda.empty_cache()
    if best_metric > 0:
        train_metrics.update({'best_map': best_metric, 'best_epoch': best_epoch})
    if eval_metrics is not None:
        train_metrics.update(eval_metrics)
    dllogger.log(step=(), data=train_metrics, verbosity=0)
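A side note on the `memory_format` handling in the example above: the channels-last path only takes effect when the input batches are converted to the same memory format as the model. A minimal, self-contained sketch (the layer and shapes are illustrative):

import torch

memory_format = torch.channels_last
conv = torch.nn.Conv2d(3, 8, kernel_size=3).cuda().to(memory_format=memory_format)
images = torch.randn(4, 3, 224, 224, device="cuda").to(memory_format=memory_format)
out = conv(images)  # dispatched to NHWC kernels where cuDNN supports them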
Example #10
def main(_):
    setup_xla_flags()

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    processors = {
        "bc5cdr": BC5CDRProcessor,
        "clefe": CLEFEProcessor,
        'i2b2': I2b22012Processor
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        set_affinity(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        if FLAGS.amp:
            tf.enable_resource_variables()
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuaration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(
                key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
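                                # With Horovod, the base learning rate is scaled
                                # linearly by the number of workers.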
                                learning_rate=FLAGS.learning_rate
                                if not FLAGS.horovod else FLAGS.learning_rate *
                                hvd.size(),
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd,
                                amp=FLAGS.amp)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        #train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        #filed_based_convert_examples_to_features(
        #    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,  #train_file,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        #estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
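        # LogTrainRunHook (added above) reports run time excluding startup overhead and
        # the number of skipped warm-up steps, used for the overhead-free throughput below.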
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            dllogging.logger.log(
                step=(),
                data={"throughput_train": ss_sentences_per_second},
                verbosity=Verbosity.DEFAULT)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
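        # Evaluation (and prediction below) runs only on the master process.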
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                dllogging.logger.log(step=(),
                                     data={key: float(str(result[key]))},
                                     verbosity=Verbosity.DEFAULT)
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples,
                                                 label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer,
                                                 predict_file,
                                                 mode="test")

        with tf.io.gfile.GFile(os.path.join(FLAGS.output_dir, 'label2id.pkl'),
                               'rb') as rf:
            label2id = pickle.load(rf)
            id2label = {value: key for key, value in label2id.items()}
        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if tf.io.gfile.exists(token_path):
            tf.io.gfile.remove(token_path)

        tf.compat.v1.logging.info("***** Running prediction*****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d",
                                  FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
        test_labels_err_file = os.path.join(FLAGS.output_dir,
                                            "test_labels_errs.txt")
        with tf.io.gfile.GFile(output_predict_file, 'w') as writer, \
                tf.io.gfile.GFile(test_labels_file, 'w') as tl, \
                tf.io.gfile.GFile(test_labels_err_file, 'w') as tle:
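            # Stream predictions, writing one predicted label per token; result_to_pair emits
            # token/label pairs for the conlleval-style scoring below (alignment errors go to the *_errs file).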
            print(id2label)
            i = 0
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=eval_hooks,
                                                yield_single_examples=True):
                output_line = "\n".join(id2label[id]
                                        for id in prediction if id != 0) + "\n"
                writer.write(output_line)
                result_to_pair(predict_examples[i], prediction, id2label, tl,
                               tle)
                i = i + 1

        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Keep the fastest 99% of batch times, dropping init/warm-up outliers from the throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
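        # time_list is sorted ascending, so the max over the first p% of entries gives
        # the p-th percentile latency ("confidence level") reported below.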
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        tf.compat.v1.logging.info('Reading: %s', test_labels_file)
        with tf.io.gfile.GFile(test_labels_file, "r") as f:
            counts = evaluate(f)
        eval_result = report_notprint(counts)
        print(''.join(eval_result))
        with tf.io.gfile.GFile(
                os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'),
                'w') as fd:
            fd.write(''.join(eval_result))