Example #1
    def save(self, estimator, epoch):
        """Save estimator to the log_dir.

        Args:
            estimator (datasetinsights.estimators.Estimator):
                datasetinsights estimator object.
            epoch (int): Epoch number.

        """
        if self.distributed and not is_master():
            return
        self._writer.save(estimator=estimator, epoch=epoch)
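A minimal usage sketch for this save() method, assuming an estimator and a checkpointer have already been constructed (see the other examples); the epoch count and training step are hypothetical:

# Hypothetical training loop: checkpoint after every epoch. Worker processes
# return from save() immediately because of the is_master() guard above, so
# only the master process writes to the log_dir.
for epoch in range(1, 11):  # 10 epochs, illustrative
    # ... run one training epoch on estimator here ...
    checkpointer.save(estimator, epoch)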
Example #2
    def __init__(
        self, dirname, prefix, *, suffix=DEFAULT_SUFFIX, create_dir=True
    ):
        self.dirname = dirname
        self.prefix = prefix
        self.suffix = suffix
        self.is_master = is_master()
        if create_dir:
            if not os.path.exists(dirname):
                os.makedirs(dirname)

        if not os.path.exists(dirname):
            raise ValueError(f"Directory path '{dirname}' is not found.")
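A hedged construction sketch, assuming this __init__ belongs to a local checkpoint-writer class (called LocalEstimatorWriter here for illustration); the directory and prefix values are illustrative:

# Hypothetical: create a writer rooted at a local log directory.
# With create_dir=True (the default) the directory is created if missing;
# with create_dir=False a missing directory raises ValueError, as shown above.
writer = LocalEstimatorWriter(
    dirname="/tmp/runs/faster_rcnn",  # illustrative log directory
    prefix="FasterRCNN",              # checkpoint filenames start with this prefix
)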
Example #3
    def __init__(
        self,
        *,
        config,
        writer,
        kfp_writer,
        checkpointer,
        box_score_thresh=0.05,
        no_cuda=None,
        checkpoint_file=None,
        **kwargs,
    ):
        """Initialize the estimator."""

        logger.info("initializing faster rcnn")
        self.config = config

        self._init_distributed_mode()
        self.no_cuda = no_cuda
        self._init_device()
        self.writer = SummaryWriter(writer.logdir, write_to_disk=is_master())

        self.kfp_writer = kfp_writer
        checkpointer.distributed = self.distributed
        self.checkpointer = checkpointer

        model_name = f"fasterrcnn_{self.config.backbone}_fpn"
        self.model = torchvision.models.detection.__dict__[model_name](
            num_classes=config.num_classes,
            pretrained_backbone=config.pretrained_backbone,
            pretrained=config.pretrained,
            box_detections_per_img=MAX_BOXES_PER_IMAGE,
            box_score_thresh=box_score_thresh,
        )
        self.model_without_ddp = self.model
        self.sync_metrics = config.get("synchronize_metrics", True)
        self.metrics = {}
        for metric_key, metric in config.metrics.items():
            self.metrics[metric_key] = EvaluationMetric.create(metric.name)
        self.model.to(self.device)

        if self.distributed:
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model, device_ids=[self.gpu])
            self.model_without_ddp = self.model.module

        if checkpoint_file:
            self.checkpointer.load(self, checkpoint_file)
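The constructor above ultimately builds a torchvision detection model from the config. A minimal sketch of just that step, with illustrative values standing in for the config fields and MAX_BOXES_PER_IMAGE, assuming a torchvision version that still accepts the pretrained/pretrained_backbone flags used here:

import torchvision

backbone = "resnet50"                      # illustrative config.backbone
model_name = f"fasterrcnn_{backbone}_fpn"  # -> "fasterrcnn_resnet50_fpn"
model = torchvision.models.detection.__dict__[model_name](
    num_classes=2,               # illustrative config.num_classes
    pretrained_backbone=True,    # illustrative config.pretrained_backbone
    pretrained=False,            # illustrative config.pretrained
    box_detections_per_img=100,  # stands in for MAX_BOXES_PER_IMAGE
    box_score_thresh=0.05,
)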
Example #4
    def load(self, estimator, path):
        """Load the estimator from a given path.

        The path can be a local path, a GCS path, or an HTTP URL.

        Args:
            estimator (datasetinsights.estimators.Estimator):
                datasetinsights estimator object.
            path (str): path to the saved estimator.

        """
        if self.distributed and not is_master():
            return

        load_method = self._get_loader_from_path(path)
        load_method(estimator, path)
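A hedged usage sketch; the checkpoint paths are illustrative and the estimator and checkpointer are assumed to be already constructed:

# Hypothetical: restore a previously saved checkpoint. The loader is chosen
# from the path, e.g. a local filesystem path, a gs:// path, or an HTTP URL.
checkpointer.load(estimator, "/tmp/runs/faster_rcnn/FasterRCNN.estimator")
# ...or from cloud storage:
checkpointer.load(estimator, "gs://my-bucket/runs/20200101/FasterRCNN.estimator")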
Example #5
    def _init_distributed_mode(self):
        if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
            logger.info("found RANK and WORLD_SIZE in environment")
            self.rank = int(os.environ["RANK"])
            self.world_size = int(os.environ["WORLD_SIZE"])
            self.gpu = int(os.environ["LOCAL_RANK"])
        elif "SLURM_PROCID" in os.environ:
            logger.info("found 'SLURM_PROCID' in environment")
            self.rank = int(os.environ["SLURM_PROCID"])
            self.gpu = self.rank % torch.cuda.device_count()
        else:
            self.gpu = 0
            self.rank = 0
            logger.info("Not using distributed mode")
            self.distributed = False
            return

        device_count = torch.cuda.device_count()
        logger.info(f"device count: {device_count}")
        logger.info(f"world size: {self.world_size}")
        logger.info(f"gpu: {self.gpu}")
        logger.info(f"rank: {self.rank}")
        if device_count == 0:
            logger.info("No cuda devices found, will not parallelize")
            self.distributed = False
            return
        if not is_master():
            logging.disable(logging.ERROR)
        self.distributed = True
        torch.cuda.set_device(self.gpu)

        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
            world_size=self.world_size,
            rank=self.rank,
        )
        torch.distributed.barrier()
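The first branch above expects the launcher to provide RANK, WORLD_SIZE, and LOCAL_RANK. A hypothetical sketch of that environment for one process of a two-process job (normally these variables are exported by torch.distributed.launch, torchrun, or SLURM rather than set by hand):

import os

os.environ["MASTER_ADDR"] = "127.0.0.1"  # required by init_method="env://"
os.environ["MASTER_PORT"] = "29500"
os.environ["RANK"] = "0"                 # global rank of this process
os.environ["WORLD_SIZE"] = "2"           # total number of processes
os.environ["LOCAL_RANK"] = "0"           # GPU index on this node
# With these set (and CUDA devices available), _init_distributed_mode() takes
# the first branch and initializes the "nccl" process group.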
Example #6
def create_checkpointer(*, logdir, config):
    """Initialize the correct estimator checkpointer.

    Args:
        logdir: filepath where the local copy of the estimator is saved/loaded.
        config: experiment config; its estimator name and system.distributed
            flag determine which checkpointer is created.

    Returns:
        EstimatorCheckpoint: the correct estimator checkpoint for the config.
        The estimator checkpoint is responsible for saving and loading the
        estimator.
    """
    if logdir.startswith(GCS_BASE_STR):
        checkpointer = GCSEstimatorCheckpoint(logdir, config.estimator)
    elif checkpoint_file_on_gcs(config):
        logdir = f"{GCS_BASE_STR}{const.GCS_BUCKET}/runs/{str(logdir)}"
        checkpointer = GCSEstimatorCheckpoint(logdir, config.estimator)
    else:
        checkpointer = EstimatorCheckpoint(logdir, config.estimator)
    if config.system.distributed:
        checkpointer = DistributedEstimatorCheckpoint(
            is_master=is_master(), estimator_checkpoint=checkpointer
        )
    return checkpointer
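A hedged end-to-end sketch; the logdir is illustrative and config is assumed to expose the attributes read above (estimator, system.distributed, and whatever checkpoint_file_on_gcs inspects):

# Hypothetical: pick the right checkpointer for a local run directory.
checkpointer = create_checkpointer(logdir="/tmp/runs/20200101", config=config)
# In a distributed run this returns a DistributedEstimatorCheckpoint wrapper,
# so only the master process actually saves (see Example #1).
checkpointer.save(estimator, 1)  # signature as shown in Example #1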