Example no. 1
    def _get_captions(self, dataloader):
        if dist.is_master_process():
            pbar = tqdm(total=len(dataloader))

        annotations = []
        for batch in dataloader:
            annotations.extend([
                {'image_id': image_id.item(), 'caption': c}
                for image_id, caption in zip(batch['image_id'], batch['caption'])
                for c in caption
            ])
            if dist.is_master_process():
                pbar.update(1)
        if dist.is_master_process():
            pbar.close()
        return annotations
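The helper above assumes each batch pairs a tensor of image ids with, per image, a list of reference caption strings. A toy illustration of the expected layout and the resulting output (values made up):

import torch

# Hypothetical batch, shaped the way _get_captions iterates over it.
batch = {
    'image_id': torch.tensor([42, 43]),
    'caption': [
        ["a dog runs on the beach", "a brown dog on sand"],
        ["a plate of food on a table"],
    ],
}
# The resulting annotations list would be:
# [{'image_id': 42, 'caption': 'a dog runs on the beach'},
#  {'image_id': 42, 'caption': 'a brown dog on sand'},
#  {'image_id': 43, 'caption': 'a plate of food on a table'}]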
Example no. 2
    def __init__(self, val_dataloader, prefix):
        cache_filename = f'{prefix}_{val_dataloader.dataset.name}_cache.json'
        if not os.path.exists(cache_filename):
            annotations = self._get_captions(val_dataloader)

            self.ground_truth: Dict[int, List[str]] = defaultdict(list)
            for ann in annotations:
                self.ground_truth[ann["image_id"]].append(ann["caption"])

            self.ground_truth = tokenize(self.ground_truth)

            if dist.is_master_process():
                with open(cache_filename, 'w') as f:
                    json.dump(self.ground_truth, f)
        else:
            with open(cache_filename, 'r') as f:
                self.ground_truth = json.load(f)
            # JSON stores keys as strings; convert them back to integer image ids.
            self.ground_truth = {
                int(k): v
                for k, v in self.ground_truth.items()
            }
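The `evaluate` method of this evaluator does not appear in these excerpts. As a minimal sketch of how the cached ground truth could be consumed, assuming the class builds on `pycocoevalcap` (the `tokenize` call above points at the COCO caption toolkit) and receives predictions as the list of `{'image_id', 'caption'}` dicts produced in the later examples:

    def evaluate(self, predictions):
        # Hypothetical sketch, not the project's actual implementation.
        from pycocoevalcap.cider.cider import Cider

        # One predicted caption per image, tokenized like the references.
        results = {p['image_id']: [p['caption']] for p in predictions}
        results = tokenize(results)

        # pycocoevalcap requires identical key sets for references/hypotheses.
        references = {
            image_id: self.ground_truth[image_id] for image_id in results
        }

        cider, _ = Cider().compute_score(references, results)
        return {'CIDEr': cider}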
Example no. 3
def common_setup(_C: Config,
                 _A: argparse.Namespace,
                 job_type: str = "pretrain"):
    r"""
    Setup common stuff at the start of every pretraining or downstream
    evaluation job, all listed here to avoid code duplication. Basic steps:

    1. Fix random seeds and other PyTorch flags.
    2. Set up a serialization directory and loggers.
    3. Log important stuff such as config, process info (useful during
        distributed training).
    4. Save a copy of config to serialization directory.

    .. note::

        It is assumed that multiple processes for distributed training have
        already been launched from outside. Functions from
        :mod:`virtex.utils.distributed` module are used to get process info.

    Args:
        _C: Config object with all the parameters.
        _A: Argparse command line arguments.
        job_type: Type of job for which setup is to be done; one of
            ``{"pretrain", "downstream"}``.
    """

    # Get process rank and world size (assuming distributed is initialized).
    RANK = dist.get_rank()
    WORLD_SIZE = dist.get_world_size()

    # For reproducibility - refer https://pytorch.org/docs/stable/notes/randomness.html
    torch.manual_seed(_C.RANDOM_SEED)
    torch.backends.cudnn.deterministic = _C.CUDNN_DETERMINISTIC
    torch.backends.cudnn.benchmark = _C.CUDNN_BENCHMARK
    random.seed(_C.RANDOM_SEED)
    np.random.seed(_C.RANDOM_SEED)

    # Create serialization directory and save config in it.
    os.makedirs(_A.serialization_dir, exist_ok=True)
    _C.dump(os.path.join(_A.serialization_dir, f"{job_type}_config.yaml"))

    # Remove default logger, create a logger for each process which writes to a
    # separate log-file. This makes changes in global scope.
    logger.remove(0)
    if dist.get_world_size() > 1:
        logger.add(
            os.path.join(_A.serialization_dir, f"log-rank{RANK}.txt"),
            format="{time} {level} {message}",
        )

    # Add a logger for stdout only for the master process.
    if dist.is_master_process():
        logger.add(sys.stdout,
                   format="<g>{time}</g>: <lvl>{message}</lvl>",
                   colorize=True)

    # Print process info, config and args.
    logger.info(f"Rank of current process: {RANK}. World size: {WORLD_SIZE}")
    logger.info(str(_C))

    logger.info("Command line args:")
    for arg in vars(_A):
        logger.info("{:<20}: {}".format(arg, getattr(_A, arg)))
Example no. 4
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        all_captions=True)
    train_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="train", all_captions=True, include_image=False)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True, include_image=False)

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    train_dataloader_no_image = DataLoader(
        train_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator = CiderEvaluator(train_dataloader_no_image, prefix='train')

    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator_val = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Load supervised trained model
    model = PretrainingModelFactory.from_config(_C).to(device)
    CheckpointManager(model=model).load(_A.start_checkpoint)

    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    if dist.is_master_process():
        print(
            'total parameters:',
            sum([
                np.prod(p.shape) for p in model.parameters() if p.requires_grad
            ]))
        print(
            f'train data: {len(train_dataloader)}, val data: {len(val_dataloader)}'
        )

    tokenizer = train_dataset.tokenizer

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one process.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = dist.DistributedDataParallel(model,
                                             device_ids=[device],
                                             find_unused_parameters=False)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)
    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            model.sample_on()
            model.eval()
            with torch.no_grad():
                greedy_dec = model({"image": batch["image"]},
                                   sample_mode="greedy")['predictions']
                out = model({"image": batch["image"]},
                            sample_mode="sample",
                            n_samples_per_image=5)
                sample_dec = out['predictions']
                caption_lengths = out['caption_lengths']
            model.train()
            model.sample_off()

            sample_log_probs = -model(
                {
                    "image": batch["image"],
                    "caption_tokens": sample_dec,
                    "caption_lengths": caption_lengths
                },
                loss_reduction='none')['loss']

            image_ids = batch['image_id'].tolist()
            reward = compute_scts_reward(image_ids, greedy_dec[:, 1:],
                                         sample_dec[:, 1:], tokenizer,
                                         evaluator)
            reward = torch.from_numpy(reward).to(device)

            mask = sample_dec[:, 1:] != tokenizer.pad_id
            loss = -sample_log_probs * reward * mask
            loss = loss.sum() / mask.sum()
        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Reward {-loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            predictions: List[Dict[str, Any]] = []
            if dist.is_master_process():
                pbar = tqdm(total=len(val_dataloader))
            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                val_batch = {
                    'image_id': val_batch['image_id'].to(device),
                    'image': val_batch['image'].to(device)
                }
                output_dict = model(val_batch)

                for image_id, caption in zip(val_batch['image_id'],
                                             output_dict['predictions'][:, 1:]):
                    predictions.append({
                        'image_id': image_id.item(),
                        'caption': tokenizer.decode(caption.tolist())
                    })
                if dist.is_master_process():
                    pbar.update(1)
            if dist.is_master_process():
                pbar.close()

            metrics = evaluator_val.evaluate(predictions)
            metrics = {
                k: torch.tensor(v, dtype=torch.float, device=device)
                for k, v in metrics.items()
            }
            dist.average_across_processes(metrics)
            metrics = {k: v.item() for k, v in metrics.items()}

            torch.set_grad_enabled(True)
            model.train()

            if dist.is_master_process():
                logger.info(f"Iteration: {iteration} | Metrics: {metrics}")
                tensorboard_writer.add_scalars("val", metrics, iteration)

        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            batch = next(iter(val_dataloader))
            batch = {"image": batch["image"][:8].to(device)}
            predictions = model(batch)["predictions"].cpu()

            captions = []
            for i in range(predictions.shape[0]):
                caption = tokenizer.decode(predictions[i].tolist())
                captions.append(caption)

            mean = torch.tensor(IMAGENET_COLOR_MEAN,
                                dtype=torch.float).view(1, 3, 1, 1)
            std = torch.tensor(IMAGENET_COLOR_STD,
                               dtype=torch.float).view(1, 3, 1, 1)
            image = batch["image"].cpu() * std + mean

            if dist.is_master_process():
                logger.info(f"Sample Generated Captions:")
                log_text = ""
                for i, caption in enumerate(captions):
                    logger.info(f"\t{caption}")
                    log_text += f"{caption}\n\n"
                tensorboard_writer.add_text(f"samples_itr{iteration}",
                                            log_text, iteration)
                tensorboard_writer.add_images(f"samples_itr{iteration}", image,
                                              iteration)

            torch.set_grad_enabled(True)
            model.train()
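`compute_scts_reward` itself is not included in these excerpts. The loop above follows the standard self-critical sequence training (SCST) recipe, where the greedy caption's score serves as a baseline for the sampled captions, so a plausible sketch looks like the helper below. It is an assumption about that logic, not the project's implementation: it scores captions with a per-batch CIDEr (the real code may use precomputed corpus statistics and the same tokenization as the cached references), assumes `sample_dec` stores k samples per image in row-major order, and returns a `(num_samples, 1)` array so it broadcasts over token positions.

import numpy as np
from pycocoevalcap.cider.cider import Cider


def compute_scts_reward_sketch(image_ids, greedy_dec, sample_dec, tokenizer,
                               evaluator):
    # reward[i*k + j] = CIDEr(sample j of image i) - CIDEr(greedy caption of image i)
    k = sample_dec.shape[0] // greedy_dec.shape[0]

    # Score greedy and sampled captions in a single call; keys only need to be
    # unique and consistent between references (gts) and hypotheses (res).
    gts, res = {}, {}
    for i, image_id in enumerate(image_ids):
        references = evaluator.ground_truth[image_id]
        gts[f"greedy_{i}"] = references
        res[f"greedy_{i}"] = [tokenizer.decode(greedy_dec[i].tolist())]
        for j in range(k):
            gts[f"sample_{i}_{j}"] = references
            res[f"sample_{i}_{j}"] = [tokenizer.decode(sample_dec[i * k + j].tolist())]

    _, per_caption = Cider().compute_score(gts, res)
    scores = dict(zip(gts.keys(), per_caption))

    reward = np.array([
        scores[f"sample_{i}_{j}"] - scores[f"greedy_{i}"]
        for i in range(len(image_ids)) for j in range(k)
    ], dtype=np.float32)
    return reward.reshape(-1, 1)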
Example no. 5
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and back it up in the serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # Get dataset name for tensorboard logging.
    DATASET = _DOWNC.DATA.ROOT.split("/")[-1]

    # Set number of output classes according to dataset:
    NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142}
    NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET]

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            train_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=True,
        ),
        drop_last=False,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val")
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            val_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
        ),
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    # Initialize model using pretraining config.
    pretrained_model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method: do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        CheckpointManager(model=pretrained_model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        pretrained_model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Pull out the CNN (torchvision-like) from our pretrained model and add
    # back the FC layer - this exists in torchvision models, and is set to
    # `nn.Identity()` during pretraining.
    model = pretrained_model.visual.cnn  # type: ignore
    model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE,
                         NUM_CLASSES).to(device)
    model = model.to(device)

    # Re-initialize the FC layer.
    torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01)
    torch.nn.init.constant_(model.fc.bias.data, 0.0)

    # Freeze all layers except FC as per config param.
    if _DOWNC.MODEL.VISUAL.FROZEN:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    # Cross entropy loss and accuracy meter.
    criterion = nn.CrossEntropyLoss()
    top1 = TopkAccuracy(top_k=1)

    optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer)
    del pretrained_model

    # -------------------------------------------------------------------------
    #  BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device)

    # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
    # NOTE: Always do this before wrapping model with DistributedDataParallel.
    if _DOWNC.FP16_OPT > 0:
        from apex import amp

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=f"O{_DOWNC.FP16_OPT}")

    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS)

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        logits = model(batch["image"])
        loss = criterion(logits, batch["label"])

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if _DOWNC.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss,
                                          iteration)
            tensorboard_writer.add_scalar(
                f"{DATASET}/learning_rate",
                optimizer.param_groups[0]["lr"],
                iteration,
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            total_val_loss = torch.tensor(0.0).to(device)

            for val_iteration, batch in enumerate(val_dataloader, start=1):
                for key in batch:
                    batch[key] = batch[key].to(device)

                logits = model(batch["image"])
                loss = criterion(logits, batch["label"])
                top1(logits, batch["label"])
                total_val_loss += loss

            # Divide the accumulated val loss by the number of val batches per GPU.
            total_val_loss = total_val_loss / val_iteration
            dist.average_across_processes(total_val_loss)

            # Get accumulated Top-1 accuracy for logging across GPUs.
            acc = top1.get_metric(reset=True)
            dist.average_across_processes(acc)

            torch.set_grad_enabled(True)
            model.train()

            # Save recent checkpoint and best checkpoint based on accuracy.
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc})")
            tensorboard_writer.add_scalar(f"{DATASET}/val_loss",
                                          total_val_loss, iteration)
            # This name scoping will result in Tensorboard displaying all metrics
            # (VOC07, caption, etc.) together.
            tensorboard_writer.add_scalars(f"metrics/{DATASET}", {"top1": acc},
                                           iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
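`TopkAccuracy` comes from the project's metric utilities and is not reproduced here. A hypothetical stand-in that matches how it is used above (called on logits/labels per batch, then read out and reset via `get_metric(reset=True)`):

import torch


class TopkAccuracySketch:
    """Hypothetical stand-in for the TopkAccuracy meter used above."""

    def __init__(self, top_k: int = 1):
        self.top_k = top_k
        self.correct = 0.0
        self.total = 0.0

    def __call__(self, logits: torch.Tensor, labels: torch.Tensor):
        # Count predictions whose top-k classes contain the true label.
        top_k_ids = logits.topk(self.top_k, dim=-1).indices
        self.correct += (top_k_ids == labels.unsqueeze(-1)).any(dim=-1).sum().item()
        self.total += labels.numel()

    def get_metric(self, reset: bool = False) -> torch.Tensor:
        # Return a tensor so it can be averaged across processes, as above.
        accuracy = torch.tensor(self.correct / max(self.total, 1.0))
        if reset:
            self.correct, self.total = 0.0, 0.0
        return accuracy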
Example no. 6
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C, split="val")

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one process.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)
    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            output_dict = model(batch)
            loss = output_dict["loss"]

        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )
                tensorboard_writer.add_scalars("train",
                                               output_dict["loss_components"],
                                               iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)

                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

            logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]")
            if dist.is_master_process():
                tensorboard_writer.add_scalars("val", val_loss_dict, iteration)
Example no. 7
def main(_A: argparse.Namespace):
    apex = False
    is_cpu = False
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C,
                                                          split="train",
                                                          csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        csv=_A.val_csv)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(model=model,
                                            optimizer=optimizer,
                                            scheduler=scheduler).load(
                                                _A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex:
            if _C.FP16_OPT > 0:
                from apex import amp

                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level=f"O{_C.FP16_OPT}")

        # Wrap model in DDP if using more than one processes.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True)

        # Create checkpoint manager and tensorboard writer (only in master process).
        if dist.is_master_process():
            checkpoint_manager = CheckpointManager(
                _A.serialization_dir,
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
            )
            tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
            tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()

        batch_loss = torch.tensor(0.0, device=device)

        batch = next(train_dataloader_iter)
        output_dict = model(batch)

        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0 else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | "
                        f"GPU mem: {dist.gpu_mem_usage()} MB")
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars("train",
                                           output_dict["loss_components"],
                                           iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)

                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
Example no. 8
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataset = PretrainingDatasetFactory.from_config(_C, split='val',
                                                        all_captions=True)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split='val', all_captions=True, include_image=False)

    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0
        else None
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn
    )

    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn
    )

    evaluator = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))
    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {'image_id': val_batch['image_id'].to(device),
                     'image': val_batch['image'].to(device)}
        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(
            val_batch["image_id"], output_dict["predictions"][:, 1:]
        ):
            predictions.append(
                {
                    # Convert image id to int if possible (mainly for COCO eval).
                    "image_id": image_id.item(),
                    "caption": tokenizer.decode(caption.tolist()),
                }
            )
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file if specified.
    if _A.output is not None:
        os.makedirs(os.path.dirname(_A.output), exist_ok=True)
        with open(_A.output, "w") as f:
            json.dump(predictions, f)
        logger.info(f"Saved predictions to {_A.output}")

    # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This
    # should be skipped when running inference on arbitrary images.
    if _A.calc_metrics:

        metrics = evaluator.evaluate(predictions)
        metrics = {k: torch.tensor(v, dtype=torch.float, device=device)
                   for k, v in metrics.items()}
        dist.average_across_processes(metrics)
        metrics = {k: v.item() for k, v in metrics.items()}
        if dist.is_master_process():
            logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
Example no. 9
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    dataset = PretrainingDatasetFactory.from_config(_C, split='train')

    sampler = (
        DistributedSampler(dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0
        else None
    )
    val_dataloader = DataLoader(
        dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=dataset.collate_fn
    )

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()
    torch.set_grad_enabled(False)
    model.sample_on()

    captions = dict()
    
    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))
    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {'image_id': val_batch['image_id'].to(device),
                     'image': val_batch['image'].to(device)}
        predictions = []
        for k in [1]:
            predictions.append(
                model(val_batch, sample_mode='beam',
                      n_samples_per_image=k)['predictions'][:, 1:])
        # Pad decoded captions to a common length (same dtype as the token ids).
        max_length = max([p.shape[1] for p in predictions])
        predictions = [
            torch.cat((p, torch.zeros(p.shape[0], max_length - p.shape[1],
                                      dtype=p.dtype, device=device)), dim=1)
            for p in predictions
        ]
        predictions = torch.stack(predictions, dim=1)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(
            val_batch["image_id"], predictions
        ):
            captions[image_id.item()] = [tokenizer.decode(c.tolist()).strip() for c in caption]
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file, one file per process/rank.
    folder = os.path.dirname(_A.output)
    os.makedirs(folder, exist_ok=True)

    filename = os.path.basename(_A.output)
    filepath = os.path.join(folder, f'{dist.get_rank()}_{filename}')

    with open(filepath, "w") as f:
        json.dump(captions, f)
    logger.info(f"Saved predictions to {filepath}")
Example no. 10
def main_worker(gpu, ngpus_per_node, _A):
    global best_acc1
    _A.gpu = gpu
    logger.info(f"Use GPU: {_A.gpu} for training")

    # For multiprocessing distributed training, rank needs to be the
    # global rank among all the processes
    _A.rank = _A.gpu
    dist.init_process_group(
        backend=_A.dist_backend,
        init_method=_A.dist_url,
        world_size=_A.world_size,
        rank=_A.rank,
    )
    # Create model (pretrained or random init).
    model = models.resnet50(pretrained=True) if _A.pretrained else models.resnet50()

    # For multiprocessing distributed, DistributedDataParallel constructor
    # should always set the single device scope, otherwise,
    # DistributedDataParallel will use all available devices.
    torch.cuda.set_device(_A.gpu)
    model.cuda(_A.gpu)

    # When using a single GPU per process and per
    # DistributedDataParallel, we need to divide the batch size
    # ourselves based on the total number of GPUs we have
    _A.batch_size = int(_A.batch_size / ngpus_per_node)
    _A.workers = int((_A.workers + ngpus_per_node - 1) / ngpus_per_node)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[_A.gpu])

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(_A.gpu)

    optimizer = optim.SGD(
        model.parameters(), _A.lr, momentum=_A.momentum, weight_decay=_A.weight_decay
    )

    # optionally resume from a checkpoint
    if _A.resume:
        if os.path.isfile(_A.resume):
            logger.info(f"=> loading checkpoint '{_A.resume}'")

            # Map model to be loaded to specified single gpu.
            checkpoint = torch.load(_A.resume, map_location=f"cuda:{_A.gpu}")
            _A.start_epoch = checkpoint["epoch"]
            best_acc1 = checkpoint["best_acc1"]

            # best_acc1 may be from a checkpoint from a different GPU
            best_acc1 = best_acc1.to(_A.gpu)
            model.load_state_dict(checkpoint["state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            logger.info(
                f"=> loaded checkpoint '{_A.resume}' (epoch {checkpoint['epoch']})"
            )
        else:
            logger.info(f"=> no checkpoint found at '{_A.resume}'")

    cudnn.benchmark = True

    # -------------------------------------------------------------------------
    # We modify the data loading code to use our ImageNet dataset class and
    # transforms from albumentations (however, transformation steps are same).
    # -------------------------------------------------------------------------
    train_dataset = ImageNetDataset(
        root=_A.data, split="train", percentage=_A.data_percentage
    )
    logger.info(f"Size of dataset: {len(train_dataset)}")
    val_dataset = ImageNetDataset(root=_A.data, split="val")
    # Val dataset is used sparsely, don't keep it around in memory by caching.

    normalize = alb.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
        max_pixel_value=1.0,
        always_apply=True,
    )
    # Override image transform (class definition has transform according to
    # downstream linear classification protocol).
    # fmt: off
    train_dataset.image_transform = alb.Compose([
        alb.RandomResizedCrop(224, 224, always_apply=True),
        alb.HorizontalFlip(p=0.5),
        alb.ToFloat(max_value=255.0, always_apply=True),
        normalize,
    ])
    val_dataset.image_transform = alb.Compose([
        alb.Resize(256, 256, always_apply=True),
        alb.CenterCrop(224, 224, always_apply=True),
        alb.ToFloat(max_value=255.0, always_apply=True),
        normalize,
    ])
    train_sampler = DistributedSampler(train_dataset, shuffle=True)
    val_sampler = DistributedSampler(val_dataset)
    train_loader = DataLoader(
        train_dataset,
        batch_size=_A.batch_size,
        num_workers=_A.workers,
        pin_memory=True,
        sampler=train_sampler,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=_A.batch_size,
        num_workers=_A.workers,
        pin_memory=True,
        sampler=val_sampler,
    )
    # fmt: on
    # -------------------------------------------------------------------------

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=0, total_iterations=_A.epochs * len(train_loader))

    writer = SummaryWriter(log_dir=_A.serialization_dir)
    for epoch in range(_A.start_epoch, _A.epochs):
        train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, _A)

        train(train_loader, model, criterion, optimizer, epoch, timer, writer, _A)
        acc1 = validate(val_loader, model, criterion, writer, _A)

        # Remember best top-1 accuracy and save checkpoint.
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if vdist.is_master_process():
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
                _A.serialization_dir,
            )
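`adjust_learning_rate` and `save_checkpoint` are not reproduced in this excerpt. The script follows the standard PyTorch ImageNet example, so under that assumption a sketch of the usual stepwise schedule (decay the base learning rate by 10x every 30 epochs) would look like:

def adjust_learning_rate(optimizer, epoch, _A):
    # Sketch, assuming the conventional ImageNet-example schedule:
    # the base learning rate decays by a factor of 10 every 30 epochs.
    lr = _A.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr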