Example No. 1
def main(_A: argparse.Namespace):

    # Get the current device as set for current distributed process.
    # Check `launch` function in `virtex.utils.distributed` module.
    device = torch.cuda.current_device()

    # Local process group is needed for detectron2.
    pg = list(range(dist.get_world_size()))
    d2.utils.comm._LOCAL_PROCESS_GROUP = torch.distributed.new_group(pg)

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    if _A.weight_init == "imagenet":
        _A.config_override.extend(["MODEL.VISUAL.PRETRAINED", True])
    _C = Config(_A.config, _A.config_override)

    # We use `default_setup` from detectron2 to do some common setup, such as
    # logging, setting up serialization etc. For more info, look into source.
    _D2C = build_detectron2_config(_C, _A)
    default_setup(_D2C, _A)

    # Prepare weights to pass in instantiation call of trainer.
    if _A.weight_init in {"virtex", "torchvision"}:
        if _A.resume:
            # If resuming training, let detectron2 load weights by providing path.
            model = None
            weights = _A.checkpoint_path
        else:
            # Load backbone weights from VirTex pretrained checkpoint.
            model = PretrainingModelFactory.from_config(_C)
            if _A.weight_init == "virtex":
                CheckpointManager(model=model).load(_A.checkpoint_path)
            else:
                model.visual.cnn.load_state_dict(
                    torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
                    strict=False,
                )
            weights = model.visual.detectron2_backbone_state_dict()
    else:
        # If random or imagenet init, just load weights after initializing model.
        model = PretrainingModelFactory.from_config(_C)
        weights = model.visual.detectron2_backbone_state_dict()

    # Back up pretrain config and model checkpoint (if provided).
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))
    if _A.weight_init == "virtex" and not _A.resume:
        torch.save(
            model.state_dict(),
            os.path.join(_A.serialization_dir, "pretrain_model.pth"),
        )

    del model
    trainer = DownstreamTrainer(_D2C, weights)
    trainer.test() if _A.eval_only else trainer.train()
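For context, here is a minimal sketch of the argument parser these `main` functions assume. The flag names are inferred from the `_A.<attr>` accesses above and are hypothetical; the real scripts also launch distributed processes before calling `main`.

import argparse

# Hypothetical parser; flag names are inferred from `_A.<attr>` usage above.
parser = argparse.ArgumentParser(description="Downstream training with VirTex weights.")
parser.add_argument("--config", required=True, help="Path to pretraining config YAML.")
parser.add_argument("--config-override", nargs="*", default=[], help="Key-value overrides for the config.")
parser.add_argument("--weight-init", choices=["random", "imagenet", "torchvision", "virtex"], default="virtex")
parser.add_argument("--checkpoint-path", help="Checkpoint to initialize (or resume) from.")
parser.add_argument("--serialization-dir", required=True, help="Directory to save config and checkpoints.")
parser.add_argument("--resume", action="store_true", help="Resume detectron2 training from --checkpoint-path.")
parser.add_argument("--eval-only", action="store_true", help="Only run evaluation with DownstreamTrainer.")

if __name__ == "__main__":
    _A = parser.parse_args()
    main(_A)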
Example No. 2
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataloader = DataLoader(
        ImageDirectoryDataset(_A.data_root),
        batch_size=_C.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):

        val_batch["image"] = val_batch["image"].to(device)
        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(val_batch["image_id"],
                                     output_dict["predictions"]):
            predictions.append({
                "image_id": image_id.item(),
                "caption": tokenizer.decode(caption.tolist()),
            })

    # Save predictions as a JSON file if specified.
    if _A.output is not None:
        os.makedirs(os.path.dirname(_A.output), exist_ok=True)
        json.dump(predictions, open(_A.output, "w"))
        logger.info(f"Saved predictions to {_A.output}")

    # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This
    # should be skipped when running inference on arbitrary images.
    if _A.calc_metrics:
        # Assume ground truth (COCO val2017 annotations) exist.
        gt = os.path.join(_C.DATA.ROOT, "annotations", "captions_val2017.json")

        metrics = CocoCaptionsEvaluator(gt).evaluate(predictions)
        logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
Example No. 3
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)
    val_dataloader = DataLoader(
        CocoCaptionsEvalDataset(_C.DATA.ROOT),
        batch_size=_C.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        for key in val_batch:
            val_batch[key] = val_batch[key].to(device)

        # Make a dictionary of predictions in COCO format.
        with torch.no_grad():
            output_dict = model(val_batch)

        for image_id, caption in zip(val_batch["image_id"],
                                     output_dict["predictions"]):
            predictions.append({
                "image_id": image_id.item(),
                "caption": tokenizer.decode(caption.tolist),
            })

    # Assume ground truth (COCO val2017 annotations) exist.
    gt = os.path.join(_C.DATA.ROOT, "annotations", "captions_val2017.json")

    metrics = CocoCaptionsEvaluator(gt).evaluate(predictions)
    logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
Example No. 4
def get(config_path: str, pretrained: bool = False):
    r"""
    Get a model specified by relative path under VirTex's model zoo
    ``configs/`` directory.

    Args:
        config_path: Name of config file relative to ``configs/`` directory
            under project root. (E.g. ``width_ablations/bicaptioning_R_50_L1_H2048.yaml``)
        pretrained: If ``True``, will initialize the model with the pretrained
            weights. If ``False``, the weights will be initialized randomly.
    """

    # Get the original path to config file (shipped inside the package).
    _pkg_config_path = pkg_resources.resource_filename(
        "virtex.model_zoo", os.path.join("configs", config_path)
    )
    if not os.path.exists(_pkg_config_path):
        raise RuntimeError("{} not available in Model Zoo!".format(config_path))

    _C = Config(_pkg_config_path)
    model = PretrainingModelFactory.from_config(_C)

    if pretrained:
        # Get URL for the checkpoint for this config path.
        if config_path in _ModelZooUrls.CONFIG_PATH_TO_DB_ID:

            dropbox_id = _ModelZooUrls.CONFIG_PATH_TO_DB_ID[config_path]
            filename = os.path.basename(config_path).replace(".yaml", ".pth")

            checkpoint_url = f"{_ModelZooUrls.URL_PREFIX}/{dropbox_id}/{filename}?dl=1"
        else:
            raise RuntimeError("{} not available in Model Zoo!".format(config_path))

        # Download the pretrained model weights and save with a sensible name.
        # This will be downloaded only if it does not exist.
        checkpoint_path = download(
            checkpoint_url,
            dir=os.path.expanduser("~/.torch/virtex_cache"),
            filename=os.path.basename(config_path).replace(".yaml", ".pth")
        )
        CheckpointManager(model=model).load(checkpoint_path)

    return model
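A minimal usage sketch of this model zoo helper, using the config path mentioned in the docstring (the import location of `get` is assumed here, and the pretrained weights must be reachable for download):

from virtex import model_zoo  # assumed import location of `get`

# Downloads weights on first call and caches them under ~/.torch/virtex_cache.
model = model_zoo.get("width_ablations/bicaptioning_R_50_L1_H2048.yaml", pretrained=True)
model.eval()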
Example No. 5
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataloader = DataLoader(
        ImageDirectoryDataset(_A.data_root),
        batch_size=16,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        shuffle=True,
    )
    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()
    model.sample_on()

    val_batch = next(iter(val_dataloader))
    val_batch['image'] = val_batch['image'].to(device)
    with torch.no_grad():
        output_dict = model(val_batch, sample_mode='greedy')

    for caption in output_dict["predictions"][:, 1:]:
        print(tokenizer.decode(caption.tolist()))

    mean = torch.tensor(IMAGENET_COLOR_MEAN,
                        dtype=torch.float).view(1, 3, 1, 1)
    std = torch.tensor(IMAGENET_COLOR_STD, dtype=torch.float).view(1, 3, 1, 1)
    images = val_batch['image'].cpu() * std + mean

    save_image(images, 'images.png', nrow=4)
Example No. 6
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    dataset = PretrainingDatasetFactory.from_config(_C, split='train')

    dataloader = DataLoader(dataset,
                            batch_size=_C.OPTIM.BATCH_SIZE,
                            shuffle=False,
                            num_workers=_A.cpu_workers,
                            drop_last=False,
                            collate_fn=dataset.collate_fn)
    print('dataloader size:', len(dataloader))

    photoids = dict()

    pbar = tqdm(total=len(dataloader))
    for batch in dataloader:
        for image_id, photo_id in zip(batch["image_id"], batch["photo_id"]):
            photoids[image_id.item()] = photo_id.item()
        pbar.update(1)
    pbar.close()

    # Save the image_id -> photo_id mapping as a JSON file.
    os.makedirs(os.path.dirname(_A.output), exist_ok=True)
    json.dump(photoids, open(_A.output, "w"))
    logger.info(f"Saved photoids to {_A.output}")
Example No. 7
def common_setup(_C: Config,
                 _A: argparse.Namespace,
                 job_type: str = "pretrain"):
    r"""
    Setup common stuff at the start of every pretraining or downstream
    evaluation job, all listed here to avoid code duplication. Basic steps:

    1. Fix random seeds and other PyTorch flags.
    2. Set up a serialization directory and loggers.
    3. Log important stuff such as config, process info (useful during
        distributed training).
    4. Save a copy of config to serialization directory.

    .. note::

        It is assumed that multiple processes for distributed training have
        already been launched from outside. Functions from
        :mod:`virtex.utils.distributed` module are used to get process info.

    Args:
        _C: Config object with all the parameters.
        _A: Argparse command line arguments.
        job_type: Type of job for which setup is to be done; one of
            ``{"pretrain", "downstream"}``.
    """

    # Get process rank and world size (assuming distributed is initialized).
    RANK = dist.get_rank()
    WORLD_SIZE = dist.get_world_size()

    # For reproducibility - refer https://pytorch.org/docs/stable/notes/randomness.html
    torch.manual_seed(_C.RANDOM_SEED)
    torch.backends.cudnn.deterministic = _C.CUDNN_DETERMINISTIC
    torch.backends.cudnn.benchmark = _C.CUDNN_BENCHMARK
    random.seed(_C.RANDOM_SEED)
    np.random.seed(_C.RANDOM_SEED)

    # Create serialization directory and save config in it.
    os.makedirs(_A.serialization_dir, exist_ok=True)
    _C.dump(os.path.join(_A.serialization_dir, f"{job_type}_config.yaml"))

    # Remove default logger, create a logger for each process which writes to a
    # separate log-file. This makes changes in global scope.
    logger.remove(0)
    if dist.get_world_size() > 1:
        logger.add(
            os.path.join(_A.serialization_dir, f"log-rank{RANK}.txt"),
            format="{time} {level} {message}",
        )

    # Add a logger for stdout only for the master process.
    if dist.is_master_process():
        logger.add(sys.stdout,
                   format="<g>{time}</g>: <lvl>{message}</lvl>",
                   colorize=True)

    # Print process info, config and args.
    logger.info(f"Rank of current process: {RANK}. World size: {WORLD_SIZE}")
    logger.info(str(_C))

    logger.info("Command line args:")
    for arg in vars(_A):
        logger.info("{:<20}: {}".format(arg, getattr(_A, arg)))
Example No. 8
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR
    # -------------------------------------------------------------------------

    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC,
                                                         split="trainval")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    test_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="test")
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    NUM_CLASSES = len(train_dataset.class_names)

    # Initialize from a checkpoint, but only keep the visual module.
    model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    model = FeatureExtractor(model,
                             layer_name=_A.layer,
                             flatten_and_normalize=True)
    model = model.to(device).eval()

    # -------------------------------------------------------------------------
    #   EXTRACT FEATURES FOR TRAINING SVMs
    # -------------------------------------------------------------------------

    features_train: List[torch.Tensor] = []
    targets_train: List[torch.Tensor] = []

    features_test: List[torch.Tensor] = []
    targets_test: List[torch.Tensor] = []

    # VOC07 is small, extract all features and keep them in memory.
    with torch.no_grad():
        for batch in tqdm(train_dataloader, desc="Extracting train features:"):
            features = model(batch["image"].to(device))

            features_train.append(features.cpu())
            targets_train.append(batch["label"])

        # Similarly extract test features.
        for batch in tqdm(test_dataloader, desc="Extracting test features:"):
            features = model(batch["image"].to(device))

            features_test.append(features.cpu())
            targets_test.append(batch["label"])

    # Convert batches of features/targets to one large numpy array
    features_train = torch.cat(features_train, dim=0).numpy()
    targets_train = torch.cat(targets_train, dim=0).numpy().astype(np.int32)

    features_test = torch.cat(features_test, dim=0).numpy()
    targets_test = torch.cat(targets_test, dim=0).numpy().astype(np.int32)

    # -------------------------------------------------------------------------
    #   TRAIN AND TEST SVMs WITH EXTRACTED FEATURES
    # -------------------------------------------------------------------------

    input_args: List[Any] = []

    # Iterate over all VOC07 classes and train one-vs-all linear SVMs.
    for cls_idx in range(NUM_CLASSES):
        # fmt: off
        input_args.append((
            features_train,
            targets_train[:, cls_idx],
            features_test,
            targets_test[:, cls_idx],
            train_dataset.class_names[cls_idx],
        ))
        # fmt: on

    pool = mp.Pool(processes=_A.cpu_workers)
    pool_output = pool.map(train_test_single_svm, input_args)

    # -------------------------------------------------------------------------
    #   TENSORBOARD LOGGING (RELEVANT MAINLY FOR weight_init=checkpoint)
    # -------------------------------------------------------------------------

    # Tensorboard writer for logging mAP scores. This is useful especially
    # when weight_init=checkpoint (which may be coming from a training job).
    tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Test set mAP, averaged over all VOC07 classes.
    test_map = torch.tensor(pool_output).mean()
    logger.info(f"mAP: {test_map}")

    # Tensorboard logging only when _A.weight_init == "virtex"
    if _A.weight_init == "virtex":
        tensorboard_writer.add_scalars("metrics/voc07_clf", {"mAP": test_map},
                                       ITERATION)
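`train_test_single_svm` is not shown in this example. Below is a rough sketch of what a one-vs-all linear SVM worker of this shape might look like, written with scikit-learn purely for illustration; the actual function, its cost-parameter sweep, and its return value may differ.

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import average_precision_score

def train_test_single_svm_sketch(args):
    # Unpack the tuple built into `input_args` above.
    feats_train, y_train, feats_test, y_test, class_name = args

    # VOC-style labels are {-1, 0, 1}; treat everything except +1 as negative here.
    clf = LinearSVC(C=1.0, loss="squared_hinge", max_iter=2000)
    clf.fit(feats_train, (y_train == 1).astype(np.int32))

    # Score test images and report average precision (AP) for this class.
    scores = clf.decision_function(feats_test)
    return average_precision_score((y_test == 1).astype(np.int32), scores)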
Example No. 9
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        all_captions=True)
    train_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="train", all_captions=True, include_image=False)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True, include_image=False)

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    train_dataloader_no_image = DataLoader(
        train_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator = CiderEvaluator(train_dataloader_no_image, prefix='train')

    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator_val = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Load supervised trained model
    model = PretrainingModelFactory.from_config(_C).to(device)
    CheckpointManager(model=model).load(_A.start_checkpoint)

    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    if dist.is_master_process():
        print(
            'total parameters:',
            sum([
                np.prod(p.shape) for p in model.parameters() if p.requires_grad
            ]))
        print(
            f'train data: {len(train_dataloader)}, val data: {len(val_dataloader)}'
        )

    tokenizer = train_dataset.tokenizer

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one processes.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = dist.DistributedDataParallel(model,
                                             device_ids=[device],
                                             find_unused_parameters=False)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)
    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            model.sample_on()
            model.eval()
            with torch.no_grad():
                greedy_dec = model({"image": batch["image"]},
                                   sample_mode="greedy")['predictions']
                out = model({"image": batch["image"]},
                            sample_mode="sample",
                            n_samples_per_image=5)
                sample_dec = out['predictions']
                caption_lengths = out['caption_lengths']
            model.train()
            model.sample_off()

            sample_log_probs = -model(
                {
                    "image": batch["image"],
                    "caption_tokens": sample_dec,
                    "caption_lengths": caption_lengths
                },
                loss_reduction='none')['loss']

            image_ids = batch['image_id'].tolist()
            reward = compute_scts_reward(image_ids, greedy_dec[:, 1:],
                                         sample_dec[:, 1:], tokenizer,
                                         evaluator)
            reward = torch.from_numpy(reward).to(device)

            mask = sample_dec[:, 1:] != tokenizer.pad_id
            loss = -sample_log_probs * reward * mask
            loss = loss.sum() / mask.sum()
        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Reward {-loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            predictions: List[Dict[str, Any]] = []
            if dist.is_master_process():
                pbar = tqdm(total=len(val_dataloader))
            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                val_batch = {
                    'image_id': val_batch['image_id'].to(device),
                    'image': val_batch['image'].to(device)
                }
                output_dict = model(val_batch)

                for image_id, caption in zip(val_batch['image_id'],
                                             output_dict['predictions'][:, 1:]):
                    predictions.append({
                        'image_id': image_id.item(),
                        'caption': tokenizer.decode(caption.tolist()),
                    })
                if dist.is_master_process():
                    pbar.update(1)
            if dist.is_master_process():
                pbar.close()

            metrics = evaluator_val.evaluate(predictions)
            metrics = {
                k: torch.tensor(v, dtype=torch.float, device=device)
                for k, v in metrics.items()
            }
            dist.average_across_processes(metrics)
            metrics = {k: v.item() for k, v in metrics.items()}

            torch.set_grad_enabled(True)
            model.train()

            if dist.is_master_process():
                logger.info(f"Iteration: {iteration} | Metrics: {metrics}")
                tensorboard_writer.add_scalars("val", metrics, iteration)

        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            batch = next(iter(val_dataloader))
            batch = {"image": batch["image"][:8].to(device)}
            predictions = model(batch)["predictions"].cpu()

            captions = []
            for i in range(predictions.shape[0]):
                caption = tokenizer.decode(predictions[i].tolist())
                captions.append(caption)

            mean = torch.tensor(IMAGENET_COLOR_MEAN,
                                dtype=torch.float).view(1, 3, 1, 1)
            std = torch.tensor(IMAGENET_COLOR_STD,
                               dtype=torch.float).view(1, 3, 1, 1)
            image = batch["image"].cpu() * std + mean

            if dist.is_master_process():
                logger.info(f"Sample Generated Captions:")
                log_text = ""
                for i, caption in enumerate(captions):
                    logger.info(f"\t{caption}")
                    log_text += f"{caption}\n\n"
                tensorboard_writer.add_text(f"samples_itr{iteration}",
                                            log_text, iteration)
                tensorboard_writer.add_images(f"samples_itr{iteration}", image,
                                              iteration)

            torch.set_grad_enabled(True)
            model.train()
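`compute_scts_reward` is external to this snippet. Conceptually, self-critical sequence training rewards each sampled caption by its score minus the score of the greedy decode used as a baseline; the sketch below illustrates that idea under assumptions (the real function and the `CiderEvaluator` API may differ, and it ignores that five samples per image are drawn above).

import numpy as np

def compute_scts_reward_sketch(image_ids, greedy_dec, sample_dec, tokenizer, evaluator):
    # Decode token ids into caption strings for both decodes.
    greedy_caps = [tokenizer.decode(c.tolist()) for c in greedy_dec]
    sample_caps = [tokenizer.decode(c.tolist()) for c in sample_dec]

    # Hypothetical evaluator call: per-image CIDEr for each set of captions.
    cider_sample = evaluator.score(image_ids, sample_caps)   # assumed method
    cider_greedy = evaluator.score(image_ids, greedy_caps)   # assumed method

    # SCST reward: sampled score minus the greedy (baseline) score.
    return np.asarray(cider_sample) - np.asarray(cider_greedy)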
Example No. 10
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # Get dataset name for tensorboard logging.
    DATASET = _DOWNC.DATA.ROOT.split("/")[-1]

    # Set number of output classes according to dataset:
    NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142}
    NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET]

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            train_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=True,
        ),
        drop_last=False,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val")
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            val_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
        ),
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    # Initialize model using pretraining config.
    pretrained_model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        CheckpointManager(model=pretrained_model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        pretrained_model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Pull out the CNN (torchvision-like) from our pretrained model and add
    # back the FC layer - this exists in torchvision models, and is set to
    # `nn.Identity()` during pretraining.
    model = pretrained_model.visual.cnn  # type: ignore
    model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE,
                         NUM_CLASSES).to(device)
    model = model.to(device)

    # Re-initialize the FC layer.
    torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01)
    torch.nn.init.constant_(model.fc.bias.data, 0.0)

    # Freeze all layers except FC as per config param.
    if _DOWNC.MODEL.VISUAL.FROZEN:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    # Cross entropy loss and accuracy meter.
    criterion = nn.CrossEntropyLoss()
    top1 = TopkAccuracy(top_k=1)

    optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer)
    del pretrained_model

    # -------------------------------------------------------------------------
    #  BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device)

    # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
    # NOTE: Always do this before wrapping model with DistributedDataParallel.
    if _DOWNC.FP16_OPT > 0:
        from apex import amp

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=f"O{_DOWNC.FP16_OPT}")

    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS)

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        logits = model(batch["image"])
        loss = criterion(logits, batch["label"])

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if _DOWNC.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss,
                                          iteration)
            tensorboard_writer.add_scalar(
                f"{DATASET}/learning_rate",
                optimizer.param_groups[0]["lr"],
                iteration,
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            total_val_loss = torch.tensor(0.0).to(device)

            for val_iteration, batch in enumerate(val_dataloader, start=1):
                for key in batch:
                    batch[key] = batch[key].to(device)

                logits = model(batch["image"])
                loss = criterion(logits, batch["label"])
                top1(logits, batch["label"])
                total_val_loss += loss

            # Divide each loss component by number of val batches per GPU.
            total_val_loss = total_val_loss / val_iteration
            dist.average_across_processes(total_val_loss)

            # Get accumulated Top-1 accuracy for logging across GPUs.
            acc = top1.get_metric(reset=True)
            dist.average_across_processes(acc)

            torch.set_grad_enabled(True)
            model.train()

            # Save recent checkpoint and best checkpoint based on accuracy.
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc})")
            tensorboard_writer.add_scalar(f"{DATASET}/val_loss",
                                          total_val_loss, iteration)
            # This name scoping will result in Tensorboard displaying all metrics
            # (VOC07, caption, etc.) together.
            tensorboard_writer.add_scalars(f"metrics/{DATASET}", {"top1": acc},
                                           iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
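`TopkAccuracy` is used above as a stateful meter (`top1(logits, labels)` to accumulate, `get_metric(reset=True)` to read). A minimal hypothetical implementation with that interface, purely for illustration; the real class may return a tensor and handle distributed averaging differently.

import torch

class TopkAccuracySketch:
    """Accumulates top-k accuracy over batches; mirrors the call pattern above."""

    def __init__(self, top_k: int = 1):
        self.top_k = top_k
        self.correct = 0
        self.total = 0

    def __call__(self, logits: torch.Tensor, labels: torch.Tensor):
        # Count predictions whose top-k classes contain the ground-truth label.
        topk = logits.topk(self.top_k, dim=-1).indices
        self.correct += (topk == labels.unsqueeze(-1)).any(dim=-1).sum().item()
        self.total += labels.numel()

    def get_metric(self, reset: bool = False) -> float:
        value = 100.0 * self.correct / max(self.total, 1)
        if reset:
            self.correct, self.total = 0, 0
        return value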
Example No. 11
def main(_A: argparse.Namespace):
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="trainval", csv=_A.csv)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        shuffle=False,
    )

    print(f"train dataset length {len(train_dataset)}")
    # Initialize from a checkpoint, but only keep the visual module.
    model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    model = FeatureExtractor(model, layer_name=_A.layer, flatten_and_normalize=True)
    model = model.to(device).eval()

    # -------------------------------------------------------------------------
    #   EXTRACT FEATURES FOR TRAINING SVMs
    # -------------------------------------------------------------------------

    features_train: List[torch.Tensor] = []
    targets_train: List[torch.Tensor] = []
    print("input csv is {}".format(_A.csv))

    # VOC07 is small, extract all features and keep them in memory.
    count = 0
    with torch.no_grad():
        for batch in tqdm(train_dataloader, desc="Extracting train features:"):
            features = model(batch["image"].to(device))
            count += 1
            print("train features has shape {}, video_id {}".format(features.shape, batch['image_id']))
            if count % 1000 == 0:

                torch.save(features_train, "./features_phoenix/features_{}_{}.pt".format(_A.mode, count))
                features_train = []
            features_train.append(features.cpu())

    torch.save(features_train, "./features_phoenix/features_end_{}.pt".format(_A.mode))
    print('finished saving features')
Example No. 12
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C, split="val")

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one processes.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)
    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            output_dict = model(batch)
            loss = output_dict["loss"]

        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )
                tensorboard_writer.add_scalars("train",
                                               output_dict["loss_components"],
                                               iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)

                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

            logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]")
            if dist.is_master_process():
                tensorboard_writer.add_scalars("val", val_loss_dict, iteration)
Example No. 13
def main(_A: argparse.Namespace):
    apex = False
    is_cpu = False
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C,
                                                          split="train",
                                                          csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        csv=_A.val_csv)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        #sampler= Sampler(train_dataset),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        # sampler = Sampler(val_dataset),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(model=model,
                                            optimizer=optimizer,
                                            scheduler=scheduler).load(
                                                _A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex:
            if _C.FP16_OPT > 0:
                from apex import amp

                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level=f"O{_C.FP16_OPT}")

        # Wrap model in DDP if using more than one processes.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True)

        # Create checkpoint manager and tensorboard writer (only in master process).
        if dist.is_master_process():
            checkpoint_manager = CheckpointManager(
                _A.serialization_dir,
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
            )
            tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
            tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()

        batch_loss = torch.tensor(0.0, device=device)

        batch = next(train_dataloader_iter)
        output_dict = model(batch)

        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0 else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | "
                        f"GPU mem: {dist.gpu_mem_usage()} MB")
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars("train",
                                           output_dict["loss_components"],
                                           iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)

                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
Example No. 14
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataset = PretrainingDatasetFactory.from_config(_C, split='val', all_captions=True)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(_C, split='val', all_captions=True, include_image=False)

    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0
        else None
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn
    )

    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn
    )

    evaluator = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))
    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {'image_id': val_batch['image_id'].to(device),
                     'image': val_batch['image'].to(device)}
        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(
            val_batch["image_id"], output_dict["predictions"][:, 1:]
        ):
            predictions.append(
                {
                    # Convert image id to int if possible (mainly for COCO eval).
                    "image_id": image_id.item(),
                    "caption": tokenizer.decode(caption.tolist()),
                }
            )
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file if specified.
    if _A.output is not None:
        os.makedirs(os.path.dirname(_A.output), exist_ok=True)
        json.dump(predictions, open(_A.output, "w"))
        logger.info(f"Saved predictions to {_A.output}")

    # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This
    # should be skipped when running inference on arbitrary images.
    if _A.calc_metrics:

        metrics = evaluator.evaluate(predictions)
        metrics = {k: torch.tensor(v, dtype=torch.float, device=device)
                   for k, v in metrics.items()}
        dist.average_across_processes(metrics)
        metrics = {k: v.item() for k, v in metrics.items()}
        if dist.is_master_process():
            logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
Example No. 15
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)

    tokenizer = TokenizerFactory.from_config(_C)

    dataset = PretrainingDatasetFactory.from_config(_C, split='train')

    sampler = (
        DistributedSampler(dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0
        else None
    )
    val_dataloader = DataLoader(
        dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=dataset.collate_fn
    )

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()
    torch.set_grad_enabled(False)
    model.sample_on()

    captions = dict()
    
    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))
    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {'image_id': val_batch['image_id'].to(device),
                     'image': val_batch['image'].to(device)}
        predictions = []
        for k in [1]:
            predictions.append(model(val_batch, sample_mode='beam', n_samples_per_image=k)['predictions'][:, 1:])
        max_length = max(p.shape[1] for p in predictions)
        # Pad with `new_zeros` so dtype/device match the token-id tensors before cat.
        predictions = [
            torch.cat((p, p.new_zeros(p.shape[0], max_length - p.shape[1])), dim=1)
            for p in predictions
        ]
        predictions = torch.stack(predictions, dim=1)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(
            val_batch["image_id"], predictions
        ):
            captions[image_id.item()] = [tokenizer.decode(c.tolist()).strip() for c in caption]
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file if specified.
    folder = os.path.dirname(_A.output)
    os.makedirs(folder, exist_ok=True)

    filename = os.path.basename(_A.output)
    filepath = os.path.join(folder, f'{dist.get_rank()}_{filename}')
    
    json.dump(captions, open(filepath, "w"))
    logger.info(f"Saved predictions to {filepath}")