Ejemplo n.º 1
0
def main(_A: argparse.Namespace):
    """Fine-tune a checkpointed captioning model with a caption-level reward.

    Datasets, dataloaders, optimizer and LR scheduler are built from the
    config files named in ``_A``; model weights are first loaded from
    ``_A.start_checkpoint``. Every training iteration decodes captions twice
    without gradients (``sample_mode="greedy"`` and ``sample_mode="sample"``
    with five samples per image), scores them with ``compute_scts_reward``
    and back-propagates the model's unreduced loss on the sampled captions
    multiplied by that reward and by a non-padding mask. Every
    ``_A.checkpoint_every`` iterations a checkpoint is written, the
    validation split is captioned and scored through ``CiderEvaluator``, and
    a handful of captions plus their images are sent to TensorBoard.

    Args:
        _A: Parsed command-line arguments. Attributes read here:
            ``num_gpus_per_machine``, ``config``, ``config_override``,
            ``cpu_workers``, ``start_checkpoint``, ``resume_from``,
            ``serialization_dir``, ``log_every`` and ``checkpoint_every``.
    """
    # No GPUs requested -> CPU. Otherwise use the CUDA device that is current
    # for this process (original note: see the `launch` function in the
    # `virtex.utils.distributed` module).
    device: Any = (torch.device("cpu") if _A.num_gpus_per_machine == 0 else
                   torch.cuda.current_device())

    # Build the config object (described upstream as immutable) and run the
    # shared setup (described upstream as logging + serialization directory).
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True)
    # Variants built with `include_image=False`; they only feed the
    # evaluators constructed below.
    train_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="train", all_captions=True, include_image=False)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True, include_image=False)

    # Shard the datasets across GPU processes with `DistributedSampler`s;
    # CPU runs use no sampler at all.
    if _A.num_gpus_per_machine > 0:
        train_sampler = DistributedSampler(  # type: ignore
            train_dataset, shuffle=True)
        val_sampler = DistributedSampler(  # type: ignore
            val_dataset, shuffle=False)
    else:
        train_sampler = val_sampler = None

    # The configured batch size is split evenly across processes.
    per_process_batch_size = _C.OPTIM.BATCH_SIZE // dist.get_world_size()
    image_loader_options = dict(
        batch_size=per_process_batch_size,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        # `shuffle` and `sampler` are mutually exclusive in `DataLoader`.
        shuffle=train_sampler is None,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
        **image_loader_options,
    )
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
        **image_loader_options,
    )

    # Both image-free loaders share the same options, including the
    # validation dataset's `collate_fn`.
    caption_loader_options = dict(
        batch_size=per_process_batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    train_dataloader_no_image = DataLoader(train_dataset_no_image,
                                           **caption_loader_options)
    evaluator = CiderEvaluator(train_dataloader_no_image, prefix='train')

    val_dataloader_no_image = DataLoader(val_dataset_no_image,
                                         **caption_loader_options)
    evaluator_val = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Build the model and load the previously trained weights into it.
    model = PretrainingModelFactory.from_config(_C).to(device)
    CheckpointManager(model=model).load(_A.start_checkpoint)

    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    if dist.is_master_process():
        trainable_parameter_count = sum(
            np.prod(p.shape) for p in model.parameters() if p.requires_grad)
        print('total parameters:', trainable_parameter_count)
        print(
            f'train data: {len(train_dataloader)}, val data: {len(val_dataloader)}'
        )

    tokenizer = train_dataset.tokenizer

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Gradient scaler for automatic mixed precision (a no-op when disabled).
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Optionally resume model/optimizer/scheduler/scaler state; `load`
    # returns the iteration to continue from.
    start_iteration = 0
    if _A.resume_from is not None:
        resume_manager = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )
        start_iteration = resume_manager.load(_A.resume_from)

    # Endless iterator over training batches.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap the model in DDP only when more than one process participates.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = dist.DistributedDataParallel(model,
                                             device_ids=[device],
                                             find_unused_parameters=False)

    # Per-iteration timing and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)

    # TensorBoard writer and checkpoint manager exist on the master process
    # only; every later use is guarded by the same check.
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            # Decode captions in eval mode, without gradients, with the
            # model's sampling switch turned on.
            model.sample_on()
            model.eval()
            with torch.no_grad():
                greedy_output = model({"image": batch["image"]},
                                      sample_mode="greedy")
                greedy_dec = greedy_output['predictions']
                sampled_output = model({"image": batch["image"]},
                                       sample_mode="sample",
                                       n_samples_per_image=5)
                sample_dec = sampled_output['predictions']
                caption_lengths = sampled_output['caption_lengths']
            model.train()
            model.sample_off()

            # Score the sampled captions with gradients enabled; the
            # unreduced loss is negated here.
            scoring_inputs = {
                "image": batch["image"],
                "caption_tokens": sample_dec,
                "caption_lengths": caption_lengths
            }
            sample_log_probs = -model(scoring_inputs,
                                      loss_reduction='none')['loss']

            # The first position of every decoded sequence is dropped before
            # scoring (presumably the start token - confirm).
            image_ids = batch['image_id'].tolist()
            reward = compute_scts_reward(image_ids, greedy_dec[:, 1:],
                                         sample_dec[:, 1:], tokenizer,
                                         evaluator)
            reward = torch.from_numpy(reward).to(device)

            # Average the reward-weighted loss over non-padding positions.
            mask = sample_dec[:, 1:] != tokenizer.pad_id
            weighted_loss = -sample_log_probs * reward * mask
            loss = weighted_loss.sum() / mask.sum()
        scaler.scale(loss).backward()

        # Unscale first so that clipping sees true gradient norms, then step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Reward {-loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                learning_rates = {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                }
                tensorboard_writer.add_scalars("learning_rate",
                                               learning_rates, iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION + SAMPLE CAPTIONS
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # Every process waits until the master has finished serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            predictions: List[Dict[str, Any]] = []
            show_progress = dist.is_master_process()
            if show_progress:
                pbar = tqdm(total=len(val_dataloader))
            for val_batch in val_dataloader:
                # Only the image ids and images are moved to the device.
                val_batch = {
                    'image_id': val_batch['image_id'].to(device),
                    'image': val_batch['image'].to(device)
                }
                output_dict = model(val_batch)

                predictions.extend(
                    {
                        'image_id': image_id.item(),
                        'caption': tokenizer.decode(caption.tolist())
                    } for image_id, caption in zip(
                        val_batch['image_id'],
                        output_dict['predictions'][:, 1:]))
                if show_progress:
                    pbar.update(1)
            if show_progress:
                pbar.close()

            # Evaluate this process' predictions, then average the metric
            # values across processes.
            raw_metrics = evaluator_val.evaluate(predictions)
            metric_tensors = {
                k: torch.tensor(v, dtype=torch.float, device=device)
                for k, v in raw_metrics.items()
            }
            dist.average_across_processes(metric_tensors)
            metrics = {k: v.item() for k, v in metric_tensors.items()}

            torch.set_grad_enabled(True)
            model.train()

            if dist.is_master_process():
                logger.info(f"Iteration: {iteration} | Metrics: {metrics}")
                tensorboard_writer.add_scalars("val", metrics, iteration)

            # Caption the first (up to) eight images of the first validation
            # batch for qualitative inspection.
            torch.set_grad_enabled(False)
            model.eval()

            first_val_batch = next(iter(val_dataloader))
            sample_batch = {"image": first_val_batch["image"][:8].to(device)}
            sample_predictions = model(sample_batch)["predictions"].cpu()

            captions = [
                tokenizer.decode(token_row.tolist())
                for token_row in sample_predictions
            ]

            # Undo the colour normalisation using the ImageNet mean/std
            # constants before sending images to TensorBoard.
            mean = torch.tensor(IMAGENET_COLOR_MEAN,
                                dtype=torch.float).view(1, 3, 1, 1)
            std = torch.tensor(IMAGENET_COLOR_STD,
                               dtype=torch.float).view(1, 3, 1, 1)
            image = sample_batch["image"].cpu() * std + mean

            if dist.is_master_process():
                logger.info("Sample Generated Captions:")
                for caption in captions:
                    logger.info(f"\t{caption}")
                log_text = "".join(f"{caption}\n\n" for caption in captions)
                tensorboard_writer.add_text(f"samples_itr{iteration}",
                                            log_text, iteration)
                tensorboard_writer.add_images(f"samples_itr{iteration}", image,
                                              iteration)

            torch.set_grad_enabled(True)
            model.train()
Ejemplo n.º 2
0
def main(_A: argparse.Namespace):
    """Train a pretraining model and periodically report validation losses.

    Datasets, dataloaders, model, optimizer and LR scheduler are all built
    from the config files named in ``_A``. Training runs under
    ``torch.cuda.amp`` autocast with a gradient scaler and gradient-norm
    clipping. Every ``_A.checkpoint_every`` iterations the master process
    writes a checkpoint and all processes compute the validation loss
    components, averaged over batches and then across processes.

    Args:
        _A: Parsed command-line arguments. Attributes read here:
            ``num_gpus_per_machine``, ``config``, ``config_override``,
            ``cpu_workers``, ``resume_from``, ``serialization_dir``,
            ``log_every`` and ``checkpoint_every``.
    """
    # No GPUs requested -> CPU. Otherwise use the CUDA device that is current
    # for this process (original note: see the `launch` function in the
    # `virtex.utils.distributed` module).
    device: Any = (torch.device("cpu") if _A.num_gpus_per_machine == 0 else
                   torch.cuda.current_device())

    # Build the config object (described upstream as immutable) and run the
    # shared setup (described upstream as logging + serialization directory).
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C, split="val")

    # Shard the datasets across GPU processes with `DistributedSampler`s;
    # CPU runs use no sampler at all.
    if _A.num_gpus_per_machine > 0:
        train_sampler = DistributedSampler(  # type: ignore
            train_dataset, shuffle=True)
        val_sampler = DistributedSampler(  # type: ignore
            val_dataset, shuffle=False)
    else:
        train_sampler = val_sampler = None

    # The configured batch size is split evenly across processes.
    loader_options = dict(
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        # `shuffle` and `sampler` are mutually exclusive in `DataLoader`.
        shuffle=train_sampler is None,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
        **loader_options,
    )
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
        **loader_options,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Gradient scaler for automatic mixed precision (a no-op when disabled).
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Optionally resume model/optimizer/scheduler/scaler state; `load`
    # returns the iteration to continue from.
    start_iteration = 0
    if _A.resume_from is not None:
        resume_manager = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )
        start_iteration = resume_manager.load(_A.resume_from)

    # Endless iterator over training batches.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap the model in DDP only when more than one process participates.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    # Per-iteration timing and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)

    # TensorBoard writer and checkpoint manager exist on the master process
    # only; every later use is guarded by the same check.
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            output_dict = model(batch)
            loss = output_dict["loss"]

        scaler.scale(loss).backward()

        # Unscale first so that clipping sees true gradient norms, then step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                learning_rates = {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                }
                tensorboard_writer.add_scalars("learning_rate",
                                               learning_rates, iteration)
                tensorboard_writer.add_scalars("train",
                                               output_dict["loss_components"],
                                               iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every != 0:
            continue

        if dist.is_master_process():
            checkpoint_manager.step(iteration)

        # Every process waits until the master has finished serializing.
        dist.synchronize()

        torch.set_grad_enabled(False)
        model.eval()

        # Running sum of each loss component reported by the model.
        val_loss_counter: Counter = Counter()

        for val_iteration, val_batch in enumerate(val_dataloader, start=1):
            # Move every entry of the batch to the target device in place.
            for key in val_batch:
                val_batch[key] = val_batch[key].to(device)
            val_output = model(val_batch)

            val_loss_counter.update(val_output["loss_components"])

        # Mean per validation batch on this process, then averaged across
        # processes.
        val_loss_dict = {
            component: total / val_iteration
            for component, total in val_loss_counter.items()
        }
        dist.average_across_processes(val_loss_dict)
        torch.set_grad_enabled(True)
        model.train()

        logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]")
        if dist.is_master_process():
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)
Ejemplo n.º 3
0
def main(_A: argparse.Namespace):
    """Train and evaluate a linear classifier on top of a pretrained CNN.

    The visual CNN is taken out of a pretraining model (optionally
    initialised from a checkpoint), a fresh ``nn.Linear`` head is attached,
    and the result is trained with cross-entropy on the downstream dataset
    described by the downstream config. Every ``_A.checkpoint_every``
    iterations the validation loss and top-1 accuracy are computed, averaged
    across processes, logged, and a checkpoint is written by the master
    process.

    Args:
        _A: Parsed command-line arguments. Attributes read here:
            ``num_gpus_per_machine``, ``down_config``,
            ``down_config_override``, ``config``, ``config_override``,
            ``serialization_dir``, ``cpu_workers``, ``weight_init``,
            ``checkpoint_path``, ``log_every`` and ``checkpoint_every``.

    Raises:
        KeyError: If the last path component of ``DATA.ROOT`` in the
            downstream config is neither ``"imagenet"`` nor
            ``"inaturalist"``.
    """

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization
    # directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # Get dataset name for tensorboard logging: the last path component of
    # the data root.
    DATASET = _DOWNC.DATA.ROOT.split("/")[-1]

    # Set number of output classes according to dataset:
    NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142}
    NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET]

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    # Both loaders always use a `DistributedSampler`, and the configured
    # batch size is split evenly across processes.
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            train_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=True,
        ),
        drop_last=False,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val")
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            val_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
        ),
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    # Initialize model using pretraining config.
    pretrained_model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        CheckpointManager(model=pretrained_model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        pretrained_model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Pull out the CNN (torchvision-like) from our pretrained model and add
    # back the FC layer - this exists in torchvision models, and is set to
    # `nn.Identity()` during pretraining.
    model = pretrained_model.visual.cnn  # type: ignore
    model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE,
                         NUM_CLASSES).to(device)
    model = model.to(device)

    # Re-initialize the FC layer.
    torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01)
    torch.nn.init.constant_(model.fc.bias.data, 0.0)

    # Freeze all layers except FC as per config param. Matching is by
    # substring: any parameter whose name contains "fc" stays trainable.
    if _DOWNC.MODEL.VISUAL.FROZEN:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    # Cross entropy loss and accuracy meter.
    criterion = nn.CrossEntropyLoss()
    top1 = TopkAccuracy(top_k=1)

    optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer)
    # Only the extracted CNN is needed from here on.
    del pretrained_model

    # -------------------------------------------------------------------------
    #  BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device)

    # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
    # NOTE: Always do this before wrapping model with DistributedDataParallel.
    if _DOWNC.FP16_OPT > 0:
        # Imported lazily so Apex is only required when mixed precision is on.
        from apex import amp

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=f"O{_DOWNC.FP16_OPT}")

    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    # Checkpoint manager and TensorBoard writer exist on the master process
    # only; every later use is guarded by the same check.
    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS)

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        logits = model(batch["image"])
        loss = criterion(logits, batch["label"])

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if _DOWNC.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss,
                                          iteration)
            tensorboard_writer.add_scalar(
                f"{DATASET}/learning_rate",
                optimizer.param_groups[0]["lr"],
                iteration,
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            total_val_loss = torch.tensor(0.0).to(device)

            for val_iteration, batch in enumerate(val_dataloader, start=1):
                # Move every entry of the batch to the target device in place.
                for key in batch:
                    batch[key] = batch[key].to(device)

                logits = model(batch["image"])
                loss = criterion(logits, batch["label"])
                # The accuracy meter accumulates across validation batches.
                top1(logits, batch["label"])
                total_val_loss += loss

            # Divide the summed loss by the number of val batches per GPU.
            total_val_loss = total_val_loss / val_iteration
            dist.average_across_processes(total_val_loss)

            # Get accumulated Top-1 accuracy for logging across GPUs.
            acc = top1.get_metric(reset=True)
            dist.average_across_processes(acc)

            torch.set_grad_enabled(True)
            model.train()

            # Save a checkpoint from the master process.
            # NOTE(review): the upstream comment said the best checkpoint is
            # chosen by accuracy, but `step` receives the iteration, not
            # `acc` - confirm against `CheckpointManager`.
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            # Fixed: the message used to end with an unmatched ")".
            logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc}")
            tensorboard_writer.add_scalar(f"{DATASET}/val_loss",
                                          total_val_loss, iteration)
            # This name scoping will result in Tensorboard displaying all metrics
            # (VOC07, caption, etc.) together.
            tensorboard_writer.add_scalars(f"metrics/{DATASET}", {"top1": acc},
                                           iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
Ejemplo n.º 4
0
def main(_A: argparse.Namespace):
    """Train a pretraining model, with periodic validation and checkpoints.

    Datasets (built from the CSV files named in ``_A``), dataloaders, model,
    optimizer and LR scheduler are created from the config. Each iteration
    runs one optimisation step with gradient-norm clipping. Every
    ``_A.checkpoint_every`` iterations the master process writes a checkpoint
    and all processes compute the validation loss components, averaged over
    batches and then across processes.

    The NVIDIA Apex mixed-precision path is hard-disabled by the local
    ``apex = False`` flag; the related branches are kept but never run.

    Args:
        _A: Parsed command-line arguments. Attributes read here:
            ``num_gpus_per_machine``, ``config``, ``config_override``,
            ``train_csv``, ``val_csv``, ``cpu_workers``, ``resume_from``,
            ``serialization_dir``, ``log_every`` and ``checkpoint_every``.
    """
    # Switch for the Apex mixed-precision code paths below (currently off).
    apex = False
    is_cpu = False
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    # NOTE(review): `tokenizer` is not used again in this function; it is
    # kept in case `TokenizerFactory.from_config` has side effects.
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C,
                                                          split="train",
                                                          csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        csv=_A.val_csv)

    # NOTE(review): both loaders use a `DistributedSampler` even on CPU;
    # confirm the launcher initialises a process group in that case.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified; `load` returns the
    # iteration to continue from.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(model=model,
                                            optimizer=optimizer,
                                            scheduler=scheduler).load(
                                                _A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex and _C.FP16_OPT > 0:
            from apex import amp

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=f"O{_C.FP16_OPT}")

        # Wrap model in DDP if using more than one processes.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True)

    # Create checkpoint manager and tensorboard writer (only in master
    # process). This used to sit inside the GPU-only branch above, so a CPU
    # run failed with an unbound-variable error at the first logging or
    # checkpoint step. It still runs after the model has been wrapped.
    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()

        # Detached copy of the loss value, used only for logging.
        batch_loss = torch.tensor(0.0, device=device)

        batch = next(train_dataloader_iter)
        output_dict = model(batch)

        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0 else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | "
                        f"GPU mem: {dist.gpu_mem_usage()} MB")
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars("train",
                                           output_dict["loss_components"],
                                           iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                # Move every entry of the batch to the target device in place.
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)

                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()