Example 1
    def test_benchmark_model(self, filepath: str):
        logger.info(f"Loading {filepath}")
        cfg = SSLHydraConfig.from_configs(
            [filepath, "config.DISTRIBUTED.NUM_PROC_PER_NODE=1"])
        _, config = convert_to_attrdict(cfg.default_cfg)
        if not is_fsdp_model_config(config):
            build_model(config.MODEL, config.OPTIMIZER)
Example 2
def build_retrieval_model(cfg):
    """
    Builds the model on a single GPU and initializes it from the weights.
    """
    logging.info("Building model....")
    model = build_model(cfg.MODEL, cfg.OPTIMIZER)
    if PathManager.exists(cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE):
        init_weights_path = cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE
        logging.info(f"Initializing model from: {init_weights_path}")
        weights = torch.load(init_weights_path,
                             map_location=torch.device("cuda"))
        skip_layers = cfg.MODEL.WEIGHTS_INIT.get("SKIP_LAYERS", [])
        replace_prefix = cfg.MODEL.WEIGHTS_INIT.get("REMOVE_PREFIX", None)
        append_prefix = cfg.MODEL.WEIGHTS_INIT.get("APPEND_PREFIX", None)
        state_dict_key_name = cfg.MODEL.WEIGHTS_INIT.get(
            "STATE_DICT_KEY_NAME", None)

        init_model_from_consolidated_weights(
            cfg,
            model,
            weights,
            state_dict_key_name=state_dict_key_name,
            skip_layers=skip_layers,
            replace_prefix=replace_prefix,
            append_prefix=append_prefix,
        )
    else:
        # We only log a warning if no weights file is provided: we also want to
        # benchmark randomly initialized models, so this case is supported.
        logging.warning("Model is randomly initialized....")
    logging.info(f"Model is:\n {model}")
    return model
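For context, a hedged usage sketch of the helper above. It assumes a VISSL config loaded with the same SSLHydraConfig / convert_to_attrdict pattern shown in Examples 1 and 15; imports are omitted as in the surrounding snippets, and both paths are hypothetical placeholders, not real files.

# Hypothetical usage sketch (paths are placeholders).
cfg = SSLHydraConfig.from_configs([
    "configs/config/my_retrieval_eval.yaml",  # hypothetical config path
    "config.MODEL.WEIGHTS_INIT.PARAMS_FILE=/path/to/model_final.torch",  # hypothetical weights
])
_, config = convert_to_attrdict(cfg.default_cfg)
model = build_retrieval_model(config)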
Example 3
    def _build_momentum_network(self, task: tasks.ClassyTask) -> None:
        """
        Create the teacher: it is an exponential moving average of the student.
        """
        logging.info("Building momentum encoder")

        # - same architecture but do not apply stochastic depth
        task.config["MODEL"]["TRUNK"]["VISION_TRANSFORMERS"][
            "DROP_PATH_RATE"] = 0
        task.loss.momentum_teacher = build_model(task.config["MODEL"],
                                                 task.config["OPTIMIZER"])
        task.loss.momentum_teacher = nn.SyncBatchNorm.convert_sync_batchnorm(
            task.loss.momentum_teacher)
        task.loss.momentum_teacher.to(task.device)

        if get_world_size() > 1:
            task.loss.momentum_teacher = init_distributed_data_parallel_model(
                task.loss.momentum_teacher)

        # Restore a checkpoint if one is available
        if task.loss.checkpoint is not None:
            task.loss.load_state_dict(task.loss.checkpoint)
        # Initialize from the model
        else:
            task.loss.momentum_teacher.load_state_dict(task.model.state_dict())
Example 4
    def _build_momentum_network(self, task: tasks.ClassyTask) -> None:
        """
        Create the teacher: it is an exponential moving average of the student.
        """
        logging.info("Building momentum encoder")

        # Same architecture but do not apply stochastic depth
        # TODO: make drop_path_rate configurable for teacher
        task.config["MODEL"]["TRUNK"]["VISION_TRANSFORMERS"][
            "DROP_PATH_RATE"] = 0.0
        task.loss.momentum_teacher = build_model(task.config["MODEL"],
                                                 task.config["OPTIMIZER"])
        task.loss.momentum_teacher.to(task.device)

        # Restore a checkpoint if one is available
        if task.loss.checkpoint is not None:
            task.loss.load_state_dict(task.loss.checkpoint)
        # Initialize from the model
        else:
            task_model = get_no_ddp_model(task.model)
            teacher_model = get_no_ddp_model(task.loss.momentum_teacher)
            teacher_model.load_state_dict(task_model.state_dict())

        # Setup SyncBN (useful for the XCiT)
        task.loss.momentum_teacher = nn.SyncBatchNorm.convert_sync_batchnorm(
            task.loss.momentum_teacher)
        task.loss.momentum_teacher = DistributedDataParallel(
            task.loss.momentum_teacher, device_ids=[task.device])

        # no gradients for teacher model
        for p in task.loss.momentum_teacher.parameters():
            p.requires_grad = False
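The teacher built above is meant to track the student as an exponential moving average (per the docstring). Below is a minimal sketch of that update, assumed to run inside torch.no_grad() after each student step; the helper name and the momentum value are hypothetical, and the real momentum schedule lives in the loss/hook code.

import torch
import torch.nn as nn

@torch.no_grad()
def update_momentum_teacher(student: nn.Module, teacher: nn.Module, m: float = 0.996):
    # EMA update: teacher = m * teacher + (1 - m) * student
    for p_s, p_t in zip(student.parameters(), teacher.parameters()):
        p_t.data.mul_(m).add_(p_s.data, alpha=1.0 - m)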
Example 5
    def _build_moco_encoder(self, task: tasks.ClassyTask) -> None:
        """
        Create the model replica called the encoder. This will slowly track
        the main model.
        """
        # Create the encoder, which will slowly track the model
        logging.info(
            "Building MoCo encoder - rank %s %s", *get_machine_local_and_dist_rank()
        )

        # - same architecture
        task.loss.moco_encoder = build_model(
            task.config["MODEL"], task.config["OPTIMIZER"]
        )

        task.loss.moco_encoder.to(task.device)

        # Restore a checkpoint if available, else initialize from the model
        if task.loss.checkpoint is not None:
            task.loss.load_state_dict(task.loss.checkpoint)
        else:
            for param_q, param_k in zip(
                task.base_model.parameters(), task.loss.moco_encoder.parameters()
            ):
                param_k.data.copy_(param_q.data)
                param_k.requires_grad = False
Example 6
    def _build_momentum_network(self, task: tasks.ClassyTask) -> None:
        """
        Create the model replica called the encoder. This will slowly track
        the main model.
        """
        logging.info("Building momentum encoder - rank %s %s",
                     *get_machine_local_and_dist_rank())

        # - same architecture
        task.loss.momentum_encoder = build_model(task.config["MODEL"],
                                                 task.config["OPTIMIZER"])
        task.loss.momentum_encoder = nn.SyncBatchNorm.convert_sync_batchnorm(
            task.loss.momentum_encoder)
        task.loss.momentum_encoder.to(
            torch.device("cuda" if task.use_gpu else "cpu"))

        # Initialize from the model
        if task.loss.checkpoint is None:
            for param_q, param_k in zip(
                    task.base_model.parameters(),
                    task.loss.momentum_encoder.parameters()):
                param_k.data.copy_(param_q.data)
            for buff_q, buff_k in zip(
                    task.base_model.named_buffers(),
                    task.loss.momentum_encoder.named_buffers(),
            ):
                if "running_" not in buff_k[0]:
                    continue
                buff_k[1].data.copy_(buff_q[1].data)
        task.loss.momentum_encoder = init_distributed_data_parallel_model(
            task.loss.momentum_encoder)

        # Restore a checkpoint if one is available
        if task.loss.checkpoint is not None:
            task.loss.load_state_dict(task.loss.checkpoint)
Example 7
    def _build_model(self):
        """
        - Builds and returns the model used for the task. The returned model is not
          yet copied to GPU (if using GPU) nor wrapped with DDP; that is done later
          by self.prepare().

        - We also convert the model's BatchNorm layers to SyncBatchNorm if the user
          has set the config option. Both PyTorch and Apex SyncBatchNorm are
          supported.

        - If the model is set to evaluation mode and the full model must be frozen,
          we freeze the model.

        - If the model must be initialized from a checkpoint or a user-provided
          weights file, we initialize it from the checkpoint or the weights.
        """
        logging.info("Building model....")

        # Instantiate the raw model as specified
        model = build_model(self.config["MODEL"], self.config["OPTIMIZER"])

        # Convert the BatchNorm layers to SyncBatchNorm if needed
        # Both Apex and PyTorch SyncBatchNorm are GPU only
        if (self.config["MODEL"]["SYNC_BN_CONFIG"]["CONVERT_BN_TO_SYNC_BN"]
                and self.config["MACHINE"]["DEVICE"] == "gpu"):
            model = convert_sync_bn(self.config, model)

        # Enforce eval mode, no matter what the prior transforms have done.
        # For instance, Apex converts batch norms and sets `requires_grad` to True
        if self.config["MODEL"]["FEATURE_EVAL_SETTINGS"]["EVAL_MODE_ON"]:
            if self.config["MODEL"]["FEATURE_EVAL_SETTINGS"][
                    "FREEZE_TRUNK_ONLY"]:
                logging.info(
                    "config.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_ONLY=True, "
                    "will freeze trunk...")
                model.freeze_trunk()
            elif self.config["MODEL"]["FEATURE_EVAL_SETTINGS"][
                    "FREEZE_TRUNK_AND_HEAD"]:
                logging.info(
                    "config.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_AND_HEAD=True, will "
                    "freeze trunk and head...")
                model.freeze_head_and_trunk()

        # assert that if the user set the PARAMS_FILE, it must exist and be valid.
        if (self.checkpoint_path is None
                and self.config["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]):
            assert PathManager.exists(
                self.config["MODEL"]["WEIGHTS_INIT"]
                ["PARAMS_FILE"]), "Specified PARAMS_FILE does NOT exist"
        # If the model should be initialized for finetuning or evaluation, we do it
        # here, but only after checking that no training checkpoint already exists.
        # This is important in case a training run dies and is resumed.
        if (self.checkpoint_path is None
                and self.config["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
                and PathManager.exists(
                    self.config["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"])):
            model = self._restore_model_weights(model)

        return model
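The freezing branches above are driven purely by config flags. A hedged sketch of turning them on through the same command-line-style overrides used in Example 1; the base config path is a hypothetical placeholder and imports are omitted as in the surrounding snippets.

# Hypothetical overrides enabling eval mode with a frozen trunk, mirroring the
# FEATURE_EVAL_SETTINGS keys read in _build_model above.
cfg = SSLHydraConfig.from_configs([
    "configs/config/my_linear_eval.yaml",  # hypothetical config path
    "config.MODEL.FEATURE_EVAL_SETTINGS.EVAL_MODE_ON=True",
    "config.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_ONLY=True",
])
_, config = convert_to_attrdict(cfg.default_cfg)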
Example 8
    def _create_ema_model(self):
        logging.info("Building the EMA model.")
        ema_model = build_model(self.config["MODEL"], self.config["OPTIMIZER"])
        self.ema_model = ModelEmaV2(
            ema_model,
            decay=self.config["HOOKS"]["EMA_MODEL"]["DECAY"],
            device=self.config["HOOKS"]["EMA_MODEL"]["EMA_DEVICE"],
        )
        self.ema_model.set(self.base_model)
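The wrapper above only seeds the EMA copy; during training it is expected to be refreshed after each optimizer step. A hedged sketch, assuming the ModelEmaV2-style wrapper exposes an update() method alongside the set() call used above; the hook method name is hypothetical.

    def _update_ema_model(self):
        # Fold the latest base-model weights into the EMA copy.
        self.ema_model.update(self.base_model)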
Example 9
    def _pretraining_worker(
        gpu_id: int,
        with_fsdp: bool,
        with_activation_checkpointing: bool,
        with_larc: bool,
        sync_file: str,
        result_file: str,
    ):
        init_distributed_on_file(world_size=2,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.manual_seed(0)
        torch.backends.cudnn.deterministic = True

        # Create the inputs
        batch = torch.randn(size=(8, 3, 224, 224)).cuda()
        target = torch.tensor(0.0).cuda()

        # Create a fake model based on SWAV blocks
        config = TestRegnetFSDP._create_pretraining_config(
            with_fsdp, with_activation_checkpointing, with_larc=with_larc)
        model = build_model(config["MODEL"], config["OPTIMIZER"])
        model = model.cuda()
        if with_fsdp:
            model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
        else:
            model = DistributedDataParallel(model, device_ids=[gpu_id])
        criterion = SwAVLoss(loss_config=config["LOSS"]["swav_loss"])
        optimizer = build_optimizer(config["OPTIMIZER"])
        optimizer.set_param_groups(model.parameters())

        # Run a few iterations and collect the losses
        losses = []
        num_iterations = 5
        for iteration in range(num_iterations):
            out = model(batch)
            loss = criterion(out[0], target)
            if gpu_id == 0:
                losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            if iteration <= 2:
                for name, param in model.named_parameters():
                    if "prototypes" in name:
                        param.grad = None
            optimizer.step(where=float(iteration / num_iterations))

        # Store the losses in a file to compare several methods
        if gpu_id == 0:
            with open(result_file, "wb") as f:
                pickle.dump(losses, f)
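A hedged sketch of how a worker like this is typically launched on two GPUs with torch.multiprocessing, assuming _pretraining_worker is a staticmethod of TestRegnetFSDP and two GPUs are available; the temporary files stand in for the rendezvous and results files.

import tempfile
import torch.multiprocessing as mp

with tempfile.NamedTemporaryFile() as sync_file, \
        tempfile.NamedTemporaryFile() as result_file:
    # spawn() supplies gpu_id as the first argument of the worker.
    mp.spawn(
        TestRegnetFSDP._pretraining_worker,
        args=(True, True, False, sync_file.name, result_file.name),
        nprocs=2,
    )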
Example 10
def _compute_param_groups(finetune_config: AttrDict):
    """
    Take a configuration and compute the parameter groups
    for this configuration
    """
    optimizer_schedulers = build_optimizer_schedulers(
        finetune_config["OPTIMIZER"])
    base_model = build_model(finetune_config["MODEL"],
                             finetune_config["OPTIMIZER"])
    return get_optimizer_param_groups(
        model=base_model,
        model_config=finetune_config["MODEL"],
        optimizer_config=finetune_config["OPTIMIZER"],
        optimizer_schedulers=optimizer_schedulers,
    )
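A hedged inspection sketch for the helper above, assuming finetune_config is a VISSL AttrDict produced by convert_to_attrdict as in the other examples; it only counts the groups, since their exact contents depend on the config.

param_groups = _compute_param_groups(finetune_config)
print(f"Computed {len(param_groups)} optimizer parameter groups")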
Example 11
    def test_vissl_implementation_support_multiple_resolutions(self):
        with torch.no_grad():
            config = self.vissl_swin_transformer_config()
            model = build_model(config["MODEL"], config["OPTIMIZER"]).cuda()

            x1 = torch.randn(size=(2, 3, 224, 224)).cuda()
            y1 = model.trunk(x1)[0]
            self.assertEqual(y1.shape, torch.Size([2, 768]))

            x2 = torch.randn(size=(2, 3, 96, 96)).cuda()
            y2 = model.trunk(x2)[0]
            self.assertEqual(y2.shape, torch.Size([2, 768]))

            x3 = torch.randn(size=(2, 3, 95, 95)).cuda()
            y3 = model.trunk(x3)[0]
            self.assertEqual(y3.shape, torch.Size([2, 768]))
Example 12
    def _distributed_worker(
        gpu_id: int, with_fsdp: bool, sync_file: str, result_file: str
    ):
        torch.cuda.set_device(gpu_id)
        dist.init_process_group(
            backend="nccl", init_method="file://" + sync_file, world_size=2, rank=gpu_id
        )

        # Create the inputs
        torch.manual_seed(0)
        torch.backends.cudnn.deterministic = True
        batch = torch.randn(size=(8, 3, 224, 224)).cuda()

        # Create a fake model based on SWAV blocks
        config = TestRegnetFSDP._create_config(with_fsdp)
        model = build_model(config["MODEL"], config["OPTIMIZER"])
        model = model.cuda()
        if with_fsdp:
            model = FSDP(model)
        else:
            model = DistributedDataParallel(model, device_ids=[gpu_id])
        criterion = SwAVLoss(loss_config=config["LOSS"]["swav_loss"])
        optimizer = optim.SGD(model.parameters(), lr=1e-2)

        # Run a few iterations and collect the losses
        losses = []
        for iteration in range(5):
            out = model(batch)
            loss = criterion(out[0], torch.tensor(0.0).cuda())
            if gpu_id == 0:
                losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            if iteration <= 2:
                for name, param in model.named_parameters():
                    if "prototypes" in name:
                        param.grad = None
            optimizer.step()

        # Store the losses in a file to compare several methods
        if gpu_id == 0:
            with open(result_file, "wb") as f:
                pickle.dump(losses, f)
Example 13
    def test_integration_test_model(self, filepath: str):
        logger.info(f"Loading {filepath}")
        cfg = SSLHydraConfig.from_configs([filepath])
        _, config = convert_to_attrdict(cfg.default_cfg)
        if not is_fsdp_model_config(config):
            build_model(config.MODEL, config.OPTIMIZER)
Example 14
    def _worker(gpu_id: int, sync_file: str, world_size: int):
        torch.manual_seed(0)
        os.environ["RANK"] = str(gpu_id)
        init_distributed_on_file(world_size=world_size,
                                 gpu_id=gpu_id,
                                 sync_file=sync_file)
        torch.backends.cudnn.deterministic = True

        config = TestCheckpointConversion._create_fsdp_model_config(
            with_fsdp=True)
        model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        model = fsdp_wrapper(model, **config.MODEL.FSDP_CONFIG)
        optimizer = optim.SGD(model.parameters(), lr=1e-4)

        # Fake inputs
        num_iterations = 5
        batch_size = 3
        torch.manual_seed(gpu_id)
        fake_inputs = torch.randn(size=(num_iterations, batch_size, 3, 96, 96))
        fake_targets = torch.randn(size=(num_iterations, batch_size))

        # Fake training loop
        criterion = nn.MSELoss()
        for iteration in range(num_iterations):
            fake_input = fake_inputs[iteration].cuda(gpu_id)
            fake_target = fake_targets[iteration].cuda(gpu_id)
            output1, output2 = model(fake_input)[0]
            loss = criterion(output1.sum(axis=-1), fake_target) + criterion(
                output2.sum(axis=-1), fake_target)
            if gpu_id == 0:
                print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Save a set of checkpoints, one per shard
        checkpoint_writer = CheckpointWriter(
            checkpoint_folder=".",
            is_final_train_phase=True,
            mode="iteration",
            mode_num=0,
            backend="disk",
        )
        content = {
            "classy_state_dict": {
                "base_model": {
                    "model": {
                        "trunk": model.trunk.local_state_dict()
                    },
                    "meta": {
                        "trunk": model.trunk.local_metadata_dict()
                    },
                }
            }
        }
        checkpoint_writer.save_sharded_checkpoint(content,
                                                  shard_rank=gpu_id,
                                                  world_size=world_size)
        dist.barrier()
        print(os.listdir("."))

        # Convert the checkpoint to consolidated and sliced checkpoints
        if gpu_id == 0:
            CheckpointFormatConverter.sharded_to_consolidated_checkpoint(
                "checkpoint.torch", "checkpoint_conso.torch")
            CheckpointFormatConverter.sharded_to_sliced_checkpoint(
                "checkpoint.torch", "checkpoint_sliced.torch")
        dist.barrier()
        print(os.listdir("."))

        # Now create models initialized from the previous checkpoint and compare them
        fake_test_input = torch.randn(size=(1, 3, 96, 96)).cuda(gpu_id)

        shard_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint.torch", device=torch.device("cpu"))
        shard_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        shard_model = fsdp_wrapper(shard_model, **config.MODEL.FSDP_CONFIG)
        shard_model.init_model_from_weights_params_file(config, shard_cp)

        conso_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint_conso.torch", device=torch.device("cpu"))
        conso_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        conso_model = fsdp_wrapper(conso_model, **config.MODEL.FSDP_CONFIG)
        conso_model.init_model_from_weights_params_file(config, conso_cp)

        slice_cp = CheckpointLoader.load_and_broadcast_init_weights(
            "checkpoint_sliced.torch", device=torch.device("cpu"))
        slice_model = build_model(config.MODEL, config.OPTIMIZER).cuda(gpu_id)
        slice_model = fsdp_wrapper(slice_model, **config.MODEL.FSDP_CONFIG)
        slice_model.init_model_from_weights_params_file(config, slice_cp)

        # Verifying that the models are equivalent
        if gpu_id == 0:
            slice_state_dict = slice_model.local_state_dict()
            conso_state_dict = conso_model.local_state_dict()
            assert set(slice_state_dict.keys()) == set(conso_state_dict.keys())
            for k in slice_state_dict.keys():
                slice_val = slice_state_dict[k]
                conso_val = conso_state_dict[k]
                assert torch.allclose(
                    slice_val, conso_val
                ), f"Difference for key {k}: {slice_val} VS {conso_val}"
        dist.barrier()

        with torch.no_grad():
            ref_out = model.trunk(fake_test_input)[0]
            shard_out = shard_model.trunk(fake_test_input)[0]
            conso_out = conso_model.trunk(fake_test_input)[0]
            slice_out = slice_model.trunk(fake_test_input)[0]
            assert torch.allclose(
                ref_out, shard_out), f"{ref_out.sum()} vs {shard_out.sum()}"
            assert torch.allclose(
                ref_out, conso_out), f"{ref_out.sum()} vs {conso_out.sum()}"
            assert torch.allclose(
                ref_out, slice_out), f"{ref_out.sum()} vs {slice_out.sum()}"
Example 15
    def test_pretrain_model(self, filepath):
        logger.info(f"Loading {filepath}")
        cfg = SSLHydraConfig.from_configs([filepath])
        _, config = convert_to_attrdict(cfg.default_cfg)
        build_model(config.MODEL, config.OPTIMIZER)
Example 16
def build_eval_model(config, seed=0):
    # Use the seed argument instead of a hard-coded value.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    model = build_model(config["MODEL"], config["OPTIMIZER"]).cuda()
    model.eval()
    return model
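A hedged usage sketch pairing the helper above with the config-loading pattern from the other examples and the trunk call from Example 11; the config path is a hypothetical placeholder and imports are omitted as in the surrounding snippets.

cfg = SSLHydraConfig.from_configs(["configs/config/my_eval_config.yaml"])  # hypothetical path
_, config = convert_to_attrdict(cfg.default_cfg)
model = build_eval_model(config)
with torch.no_grad():
    features = model.trunk(torch.randn(2, 3, 224, 224).cuda())[0]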