Example #1
    def build_dataloaders(
        self, pin_memory: bool, current_train_phase_idx=0
    ) -> Dict[str, torch.utils.data.DataLoader]:
        """
        Build PyTorch dataloaders for all the available_splits. By default, we construct the
        standard PyTorch Dataloader and allow setting all dataloader options.
        """
        # Give the sampler the same seed for the entire distributed group, as per
        # the PyTorch documentation.
        sampler_seed = self.config["SEED_VALUE"]

        loaders = {
            split.lower(): build_dataloader(
                dataset=self.datasets[split.lower()],
                dataset_config=self.config["DATA"][split],
                num_dataloader_workers=self.config.DATA.NUM_DATALOADER_WORKERS,
                pin_memory=pin_memory,
                multi_processing_method=self.config.MULTI_PROCESSING_METHOD,
                device=self.device,
                sampler_seed=sampler_seed,
                split=split.lower(),
            )
            for split in self.available_splits
        }

        return loaders
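The returned object is a dict keyed by lower-cased split name, not a single DataLoader. A minimal sketch of how it might be consumed, assuming `task` is an instance of the class above with its datasets already built (the `task` variable and the loop are illustrative assumptions, not part of the example):

# Sketch only: `task` is assumed to expose build_dataloaders() as above.
loaders = task.build_dataloaders(pin_memory=True)
train_loader = loaders["train"]   # keys are the lower-cased split names
for batch in train_loader:
    # each batch is whatever the underlying dataset / collator produces
    break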
Example #2
def benchmark_data(cfg: AttrDict, split: str = "train"):
    split = split.upper()
    total_images = MAX_ITERS * cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
    timer = Timer()
    dataset = build_dataset(cfg=cfg, split=split)

    try:
        device = torch.device("cuda" if cfg.MACHINE.DEVICE == "gpu" else "cpu")
    except AttributeError:
        # config has no MACHINE.DEVICE entry; default to GPU
        device = torch.device("cuda")

    dataloader = build_dataloader(
        dataset=dataset,
        dataset_config=cfg["DATA"][split],
        num_dataloader_workers=cfg.DATA.NUM_DATALOADER_WORKERS,
        pin_memory=False,
        multi_processing_method=cfg.MULTI_PROCESSING_METHOD,
        device=device,
        sampler_seed=cfg.SEED_VALUE,
        split=split,
    )

    # The Fairstore data sampler requires the start iteration to be set before it can start.
    if hasattr(dataloader.sampler, "set_start_iter"):
        dataloader.sampler.set_start_iter(0)

    # Initial warmup: the first batches include dataloader worker start-up cost,
    # so we time them separately before measuring steady-state throughput.
    timer.reset()
    data_iterator = iter(dataloader)
    for _ in range(WARMUP_ITERS):
        next(data_iterator)
    # total number of seconds since the start/reset of the timer
    warmup_time = timer.seconds()
    logging.info(f"Warmup time {WARMUP_ITERS} batches: {warmup_time} seconds")

    # measure the number of images per second over MAX_ITERS iterations.
    timer = Timer()
    for _ in tqdm.trange(MAX_ITERS):
        next(data_iterator)
    time_elapsed = timer.seconds()
    logging.info(
        f"iters: {MAX_ITERS}; images: {total_images}; time: {time_elapsed} seconds; "
        f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
        f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")

    # run benchmark for a few more rounds to catch fluctuations
    for round_idx in range(BENCHMARK_ROUNDS):
        timer = Timer()
        for _ in tqdm.trange(MAX_ITERS):
            next(data_iterator)
        time_elapsed = timer.seconds()
        logging.info(
            f"round: {round_idx}: iters: {MAX_ITERS}; images: {total_images}; "
            f"time: {time_elapsed} seconds; "
            f"images/sec: {round(float(total_images / time_elapsed), 4)}; "
            f"ms/img: {round(float(1000 * time_elapsed / total_images), 4)} ")
    del data_iterator
    del dataloader
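benchmark_data relies on module-level constants (WARMUP_ITERS, MAX_ITERS, BENCHMARK_ROUNDS) and a pre-built AttrDict config that are not shown in the snippet. A minimal sketch of how the benchmark might be driven, with illustrative constant values and a hypothetical make_cfg() helper standing in for however the project loads its config:

import logging

# Illustrative values only; the real benchmark module defines its own constants.
WARMUP_ITERS = 10        # batches consumed before timing starts
MAX_ITERS = 1000         # batches timed per measurement round
BENCHMARK_ROUNDS = 3     # extra timed rounds to expose run-to-run fluctuation

def main():
    logging.basicConfig(level=logging.INFO)
    cfg = make_cfg()                     # hypothetical: build/load the AttrDict config
    benchmark_data(cfg, split="train")   # reads cfg.DATA, cfg.MACHINE, cfg.SEED_VALUE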
Example #3
    def recreate_data_iterator(
        self,
        phase_type: str,
        epoch: int,
        compute_start_iter: bool,
        train_phase_idx: int,
    ):
        """
        Recreate data iterator (including multiprocessing workers) and destroy the
        previous iterators.

        This is called when we load a new checkpoint or when phase changes during
        the training (one epoch to the next).
        DataSampler may need to be informed on those events to update the
        epoch and start_iteration so that the data is deterministically shuffled,
        so we call them here.
        """
        start_iter = 0
        if compute_start_iter:
            start_iter = self._compute_start_iter_from_checkpoint(phase_type)

        self.set_epoch(phase_type, epoch, start_iter, train_phase_idx)

        # Give the sampler the same seed for the entire distributed group, as per
        # the PyTorch documentation.
        sampler_seed = self.config["SEED_VALUE"]
        dataset = self.datasets[phase_type]

        # For open-source (OSS) usage this always returns False; otherwise we
        # rebuild the dataloader after every phase.
        if dataset.rebuild_dataloader():
            dataloader = build_dataloader(
                dataset=dataset,
                dataset_config=self.config.DATA[phase_type.upper()],
                num_dataloader_workers=self.config.DATA.NUM_DATALOADER_WORKERS,
                pin_memory=self.config.DATA.PIN_MEMORY,
                multi_processing_method=self.config.MULTI_PROCESSING_METHOD,
                device=self.device,
                sampler_seed=sampler_seed,
                split=phase_type,
            )

            # delete old dataloader and reset it.
            del self.dataloaders[phase_type]
            gc.collect()
            self.dataloaders[phase_type] = dataloader

        # delete old dataiterator and reset it.
        del self.data_iterator
        gc.collect()
        self.data_iterator = iter(self.dataloaders[phase_type])
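A hedged sketch of where a training loop might call recreate_data_iterator: at each phase boundary, and with compute_start_iter=True only right after a checkpoint load. The `task`, `phases`, and `resumed_from_checkpoint` names below are illustrative assumptions, not part of the example:

# Sketch only: rebuild the iterator at each phase boundary so the sampler
# reshuffles deterministically for the new epoch.
for train_phase_idx, phase in enumerate(phases):
    phase_type = "train" if phase["train"] else "test"
    task.recreate_data_iterator(
        phase_type=phase_type,
        epoch=train_phase_idx,                       # illustrative: one train phase per epoch
        compute_start_iter=resumed_from_checkpoint,  # True only right after loading a checkpoint
        train_phase_idx=train_phase_idx,
    )
    for batch in task.data_iterator:
        pass  # forward / backward pass for this phase
    resumed_from_checkpoint = False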