def save(self, estimator, epoch):
    """Save estimator to the log_dir.

    Args:
        estimator (datasetinsights.estimators.Estimator): datasetinsights
            estimator object.
        epoch (int): Epoch number.
    """
    if self.distributed and not is_master():
        return
    self._writer.save(estimator=estimator, epoch=epoch)
def __init__(
    self, dirname, prefix, *, suffix=DEFAULT_SUFFIX, create_dir=True
):
    self.dirname = dirname
    self.prefix = prefix
    self.suffix = suffix
    self.is_master = is_master()

    if create_dir and not os.path.exists(dirname):
        os.makedirs(dirname)
    if not os.path.exists(dirname):
        raise ValueError(f"Directory path '{dirname}' is not found.")
def __init__(
    self,
    *,
    config,
    writer,
    kfp_writer,
    checkpointer,
    box_score_thresh=0.05,
    no_cuda=None,
    checkpoint_file=None,
    **kwargs,
):
    """Initialize the estimator."""
    logger.info("initializing faster rcnn")
    self.config = config
    self._init_distributed_mode()
    self.no_cuda = no_cuda
    self._init_device()
    self.writer = SummaryWriter(writer.logdir, write_to_disk=is_master())
    self.kfp_writer = kfp_writer
    checkpointer.distributed = self.distributed
    self.checkpointer = checkpointer

    model_name = f"fasterrcnn_{self.config.backbone}_fpn"
    self.model = torchvision.models.detection.__dict__[model_name](
        num_classes=config.num_classes,
        pretrained_backbone=config.pretrained_backbone,
        pretrained=config.pretrained,
        box_detections_per_img=MAX_BOXES_PER_IMAGE,
        box_score_thresh=box_score_thresh,
    )
    self.model_without_ddp = self.model
    self.sync_metrics = config.get("synchronize_metrics", True)
    self.metrics = {}
    for metric_key, metric in config.metrics.items():
        self.metrics[metric_key] = EvaluationMetric.create(metric.name)

    self.model.to(self.device)
    if self.distributed:
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.gpu]
        )
        self.model_without_ddp = self.model.module

    if checkpoint_file:
        self.checkpointer.load(self, checkpoint_file)
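# NOTE: illustrative sketch, not part of the estimator. It shows what the
# dynamic lookup `torchvision.models.detection.__dict__[model_name]` above
# resolves to for a resnet50 backbone; the argument values are assumptions
# standing in for the config values and MAX_BOXES_PER_IMAGE.
def _example_build_faster_rcnn():
    import torchvision

    # Equivalent to model_name = "fasterrcnn_resnet50_fpn" in the lookup
    # above, with a COCO-style 91-class head and no pretrained weights.
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        num_classes=91,
        pretrained=False,
        pretrained_backbone=False,
        box_detections_per_img=100,  # stands in for MAX_BOXES_PER_IMAGE
        box_score_thresh=0.05,
    )
    return model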
def load(self, estimator, path):
    """Load estimator from a given path.

    The path can be a local path, a GCS path, or an HTTP URL.

    Args:
        estimator (datasetinsights.estimators.Estimator): datasetinsights
            estimator object.
        path (str): path of the estimator.
    """
    if self.distributed and not is_master():
        return
    load_method = self._get_loader_from_path(path)
    load_method(estimator, path)
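# NOTE: hedged usage sketch, not from the library. It assumes a checkpointer
# instance like the ones above and an `estimator` object; the checkpoint path
# below is a placeholder, not the library's naming convention.
def _example_checkpoint_roundtrip(checkpointer, estimator):
    # Save a snapshot at the end of an epoch; on non-master ranks in
    # distributed mode this is a no-op, as implemented in `save` above.
    checkpointer.save(estimator, epoch=5)

    # Restore from a local file, a GCS object, or an HTTP URL; `load`
    # dispatches on the path scheme via `_get_loader_from_path`.
    checkpointer.load(estimator, "path/to/saved/estimator")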
def _init_distributed_mode(self):
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        logger.info("found RANK and WORLD_SIZE in environment")
        self.rank = int(os.environ["RANK"])
        self.world_size = int(os.environ["WORLD_SIZE"])
        self.gpu = int(os.environ["LOCAL_RANK"])
    elif "SLURM_PROCID" in os.environ:
        logger.info("found 'SLURM_PROCID' in environment")
        self.rank = int(os.environ["SLURM_PROCID"])
        self.gpu = self.rank % torch.cuda.device_count()
        # SLURM exposes the total task count via SLURM_NTASKS; without this,
        # self.world_size would be undefined below.
        self.world_size = int(os.environ["SLURM_NTASKS"])
    else:
        self.gpu = 0
        self.rank = 0
        logger.info("Not using distributed mode")
        self.distributed = False
        return

    device_count = torch.cuda.device_count()
    logger.info(f"device count: {device_count}")
    logger.info(f"world size: {self.world_size}")
    logger.info(f"gpu: {self.gpu}")
    logger.info(f"rank: {self.rank}")

    if device_count == 0:
        logger.info("No cuda devices found, will not parallelize")
        self.distributed = False
        return

    # Silence non-master processes so logs are not duplicated.
    if not is_master():
        logging.disable(logging.ERROR)

    self.distributed = True
    torch.cuda.set_device(self.gpu)
    torch.distributed.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=self.world_size,
        rank=self.rank,
    )
    torch.distributed.barrier()
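# NOTE: `is_master()` is used throughout this section but not shown here.
# A minimal sketch of how such a helper is commonly written, assuming
# "master" means global rank 0; the real implementation may differ.
def _example_is_master():
    import torch.distributed as dist

    # Before/without process-group initialization every process counts as
    # master; afterwards only global rank 0 does.
    if not (dist.is_available() and dist.is_initialized()):
        return True
    return dist.get_rank() == 0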
def create_checkpointer(*, logdir, config):
    """Initialize the appropriate estimator checkpointer.

    Args:
        logdir: directory where the local copy of the estimator is
            saved/loaded.
        config: run config; config.estimator and config.system.distributed
            are used here.

    Returns (EstimatorCheckpoint): the estimator checkpointer matching the
        config. The estimator checkpointer is responsible for saving and
        loading the estimator.
    """
    if logdir.startswith(GCS_BASE_STR):
        checkpointer = GCSEstimatorCheckpoint(logdir, config.estimator)
    elif checkpoint_file_on_gcs(config):
        logdir = f"{GCS_BASE_STR}{const.GCS_BUCKET}/runs/{str(logdir)}"
        checkpointer = GCSEstimatorCheckpoint(logdir, config.estimator)
    else:
        checkpointer = EstimatorCheckpoint(logdir, config.estimator)
    if config.system.distributed:
        checkpointer = DistributedEstimatorCheckpoint(
            is_master=is_master(), estimator_checkpoint=checkpointer
        )
    return checkpointer
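# NOTE: hedged usage sketch. The config layout (config.estimator,
# config.system.distributed) is taken from the function above; the concrete
# `logdir` path is a placeholder.
def _example_create_checkpointer(config):
    # A local log directory yields an EstimatorCheckpoint; a "gs://..."
    # logdir (or a checkpoint file on GCS) yields a GCSEstimatorCheckpoint.
    # With config.system.distributed=True the result is wrapped in
    # DistributedEstimatorCheckpoint so only the master rank saves/loads.
    return create_checkpointer(logdir="/tmp/runs/faster_rcnn", config=config)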