def extract_archive_file(archive_fn: str, im_dir: str):
    if not PathManager.exists(im_dir) or not PathManager.ls(im_dir):
        # Dataset is not deployed. Deploy it.
        archive_fns = archive_fn
        # A dataset may be composed of several tgz files, or only one.
        # If one, make it into a list to make the code later more general
        if not isinstance(archive_fns, list):
            archive_fns = [archive_fns]
        logger.info(
            "Extracting datasets {} to local machine at {}".format(archive_fns, im_dir)
        )
        if not PathManager.exists(im_dir):
            PathManager.mkdirs(im_dir)

        for archive_fn in archive_fns:
            # Extract the tgz file directly into the target directory,
            # without precopy.
            # Note that the tgz file contains a root directory that
            # we do not want, hence the strip-components=1
            command_unpack = (
                "tar -mxzf {src_file} -C {tgt_dir} --strip-components=1"
            ).format(src_file=archive_fn, tgt_dir=im_dir)
            assert not subprocess.call(shlex.split(command_unpack)), "Failed to unpack"
            logger.info("Extracted {}".format(archive_fn))

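# Usage sketch for extract_archive_file. The archive paths and target
# directory below are hypothetical placeholders; the function accepts either
# a single .tgz path or a list of them.
archives = [
    "/data/archives/my_dataset_part0.tgz",
    "/data/archives/my_dataset_part1.tgz",
]
extract_archive_file(archives, "/data/datasets/my_dataset/images")
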
def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
    """
    Converts dataset into COCO format and saves it to a json file.
    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.

    Args:
        dataset_name: reference from the config file to the catalogs;
            must be registered in DatasetCatalog and in detectron2's standard format
        output_file: path of json file that will be saved to
        allow_cached: if json file is already present then skip conversion
    """
    # TODO: The dataset or the conversion script *may* change,
    # a checksum would be useful for validating the cached data
    PathManager.mkdirs(os.path.dirname(output_file))
    with file_lock(output_file):
        if PathManager.exists(output_file) and allow_cached:
            logger.warning(
                f"Using previously cached COCO format annotations at '{output_file}'. "
                "You need to clear the cache file if your dataset has been modified."
            )
        else:
            logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...")
            coco_dict = convert_to_coco_dict(dataset_name)

            logger.info(f"Caching COCO format annotations at '{output_file}' ...")
            tmp_file = output_file + ".tmp"
            with PathManager.open(tmp_file, "w") as f:
                json.dump(coco_dict, f)
            shutil.move(tmp_file, output_file)

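# Usage sketch for convert_to_coco_json. "my_dataset_train" is a hypothetical
# name assumed to be registered in DatasetCatalog; the output path is a
# placeholder.
convert_to_coco_json(
    "my_dataset_train",
    output_file="/tmp/my_dataset_train_coco_format.json",
    allow_cached=True,  # skip conversion when the cached json already exists
)
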
def save_protobuf(self, output_dir):
    """
    Save the model as caffe2's protobuf format. It saves the following files:

    * "model.pb": definition of the graph. Can be visualized with
      tools like `netron <https://github.com/lutzroeder/netron>`_.
    * "model_init.pb": model parameters
    * "model.pbtxt": human-readable definition of the graph. Not needed for deployment.

    Args:
        output_dir (str): the output directory to save protobuf files.
    """
    logger = logging.getLogger(__name__)
    logger.info("Saving model to {} ...".format(output_dir))
    if not PathManager.exists(output_dir):
        PathManager.mkdirs(output_dir)

    with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
        f.write(self._predict_net.SerializeToString())
    with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
        f.write(str(self._predict_net))
    with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
        f.write(self._init_net.SerializeToString())

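# Companion sketch: reading the saved graph definition back, assuming the
# caffe2 protobuf bindings shipped with PyTorch are available. The path is
# a placeholder.
from caffe2.proto import caffe2_pb2

predict_net = caffe2_pb2.NetDef()
with open("/tmp/caffe2_export/model.pb", "rb") as f:
    predict_net.ParseFromString(f.read())
print("net '{}' has {} ops".format(predict_net.name, len(predict_net.op)))
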
def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
    self.gt_json = ann_file
    self.gt_folder = ann_folder
    if utils.is_main_process():
        if not PathManager.exists(output_dir):
            # PathManager exposes mkdirs (not mkdir) for directory creation
            PathManager.mkdirs(output_dir)
    self.output_dir = output_dir
    self.predictions = []

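# Usage sketch with placeholder paths; the enclosing class name
# (PanopticEvaluator here) is an assumption based on the DETR-style
# panoptic evaluation this __init__ appears to come from.
evaluator = PanopticEvaluator(
    ann_file="/data/coco/annotations/panoptic_val2017.json",
    ann_folder="/data/coco/annotations/panoptic_val2017",
    output_dir="panoptic_eval",
)
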
def _get_test_image(self):
    try:
        file_name = DatasetCatalog.get("coco_2017_train")[0]["file_name"]
        if not PathManager.exists(file_name):
            raise FileNotFoundError()
    except IOError:
        # for public CI to run
        file_name = "http://images.cocodataset.org/train2017/000000000009.jpg"
    with PathManager.open(file_name, "rb") as f:
        buf = f.read()
    img = cv2.imdecode(np.frombuffer(buf, dtype=np.uint8), cv2.IMREAD_COLOR)
    assert img is not None, file_name
    return torch.from_numpy(img.transpose(2, 0, 1))

def test_read_sem_seg(self):
    cityscapes_dir = MetadataCatalog.get("cityscapes_fine_sem_seg_val").gt_dir
    sem_seg_gt_path = os.path.join(
        cityscapes_dir, "frankfurt", "frankfurt_000001_083852_gtFine_labelIds.png"
    )
    if not PathManager.exists(sem_seg_gt_path):
        raise unittest.SkipTest(
            "Semantic segmentation ground truth {} not found.".format(sem_seg_gt_path)
        )
    sem_seg = detection_utils.read_image(sem_seg_gt_path, "L")
    self.assertEqual(sem_seg.ndim, 3)
    self.assertEqual(sem_seg.shape[2], 1)
    self.assertEqual(sem_seg.dtype, np.uint8)
    self.assertEqual(sem_seg.max(), 32)
    self.assertEqual(sem_seg.min(), 1)

def __init__(self, root, split, transforms=None):
    super(ADE20KParsing, self).__init__(root)
    # assert exists and prepare dataset automatically
    assert PathManager.exists(root), "Please setup the dataset"
    self.images, self.masks = _get_ade20k_pairs(root, split)
    assert len(self.images) == len(self.masks)
    if len(self.images) == 0:
        raise RuntimeError("Found 0 images in subfolders of: " + root + "\n")
    self._transforms = transforms

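# Usage sketch: the dataset root is a placeholder, and "train" is an assumed
# split value mirroring the usual ADE20K train/val layout.
dataset = ADE20KParsing(root="/data/ADEChallengeData2016", split="train")
first_image, first_mask = dataset.images[0], dataset.masks[0]
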
def save_protobuf(self, output_dir):
    """
    Save the model as caffe2's protobuf format.

    Args:
        output_dir (str): the output directory to save protobuf files.
    """
    logger = logging.getLogger(__name__)
    logger.info("Saving model to {} ...".format(output_dir))
    if not PathManager.exists(output_dir):
        PathManager.mkdirs(output_dir)

    with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f:
        f.write(self._predict_net.SerializeToString())
    with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f:
        f.write(str(self._predict_net))
    with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f:
        f.write(self._init_net.SerializeToString())

def get_sample_coco_image(tensor=True):
    """
    Args:
        tensor (bool): if True, returns 3xHxW tensor.
            else, returns a HxWx3 numpy array.

    Returns:
        an image, in BGR color.
    """
    try:
        file_name = DatasetCatalog.get("coco_2017_val_100")[0]["file_name"]
        if not PathManager.exists(file_name):
            raise FileNotFoundError()
    except IOError:
        # for public CI to run
        file_name = "http://images.cocodataset.org/train2017/000000000009.jpg"
    ret = read_image(file_name, format="BGR")
    if tensor:
        ret = torch.from_numpy(np.ascontiguousarray(ret.transpose(2, 0, 1)))
    return ret

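# Usage sketch: fetch the sample image both ways and check the layouts
# documented above (3xHxW tensor vs. HxWx3 numpy array, both BGR).
img_chw = get_sample_coco_image(tensor=True)
img_hwc = get_sample_coco_image(tensor=False)
assert img_chw.shape[0] == 3 and img_hwc.shape[2] == 3
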
def main(
    cfg: CfgNode,
    output_dir: str,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        output_dir: output directory to set up for this run
        task_cls: task class used to build the lightning module
        eval_only: True if run evaluation only.
        num_machines: Number of nodes used for distributed training
        num_processes: Number of processes on each node.
    """
    # FIXME: make comm.get_world_size() work properly.
    setup_after_launch(cfg, output_dir, _scale_world_size=False)
    auto_scale_world_size(cfg, new_world_size=num_machines * num_processes)

    task = task_cls.from_config(cfg, eval_only)
    trainer_params = get_trainer_params(cfg, num_machines, num_processes)

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=trainer_params["logger"].log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )

def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        output_dir: output directory; overrides cfg.OUTPUT_DIR when given
        task_cls: task class used to build the lightning module
        eval_only: True if run evaluation only.
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node. NOTE: Automatically
            set to the number of GPUs when using DDP. Set a value greater
            than 1 to mimic distributed training on CPUs.
    """
    assert (
        num_processes == 1 or num_gpus == 0
    ), "Only set num_processes > 1 when training on CPUs"
    maybe_override_output_dir(cfg, output_dir)

    task = task_cls.from_config(cfg, eval_only)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)

    trainer_params = {
        # training loop is bounded by max steps, use a large max_epochs to make
        # sure max_steps is met first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0
        else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )

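# Usage sketch: the cfg is assumed to be a fully prepared D2Go CfgNode built
# by the caller, and the output directory is a placeholder.
train_output = main(
    cfg,
    output_dir="/tmp/d2go_run",
    eval_only=False,
    num_machines=1,
    num_gpus=2,  # two GPUs on a single node; num_processes stays 1
)
print(train_output.tensorboard_log_dir, train_output.accuracy)
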
def fetch_checkpoints_till_final(checkpoint_dir):
    """
    A generator that yields all checkpoint paths under the given directory,
    it'll keep refreshing until model_final is found.
    """
    MIN_SLEEP_INTERVAL = 1.0  # in seconds
    MAX_SLEEP_INTERVAL = 60.0  # in seconds
    sleep_interval = MIN_SLEEP_INTERVAL

    finished_checkpoints = set()

    def _add_and_log(path):
        finished_checkpoints.add(path)
        logger.info("Found checkpoint: {}".format(path))
        return path

    def _log_and_sleep(sleep_interval):
        logger.info(
            "Sleep {} seconds while waiting for model_final.pth".format(sleep_interval)
        )
        time.sleep(sleep_interval)
        return min(sleep_interval * 2, MAX_SLEEP_INTERVAL)

    def _get_lightning_checkpoints(path: str):
        return [
            os.path.join(path, x)
            for x in PathManager.ls(path)
            if x.endswith(ModelCheckpoint.FILE_EXTENSION)
            and not x.startswith(ModelCheckpoint.CHECKPOINT_NAME_LAST)
        ]

    while True:
        if not PathManager.exists(checkpoint_dir):
            sleep_interval = _log_and_sleep(sleep_interval)
            continue

        checkpoint_paths = DetectionCheckpointer(
            None, save_dir=checkpoint_dir
        ).get_all_checkpoint_files()
        checkpoint_paths.extend(_get_lightning_checkpoints(checkpoint_dir))

        final_model_path = None
        periodic_checkpoints = []

        for path in sorted(checkpoint_paths):
            if path.endswith("model_final.pth") or path.endswith("model_final.ckpt"):
                final_model_path = path
                continue

            if path.endswith(ModelCheckpoint.FILE_EXTENSION):
                # Lightning checkpoint
                model_iter = int(
                    re.findall(
                        r"(?<=step=)\d+(?={})".format(ModelCheckpoint.FILE_EXTENSION),
                        path,
                    )[0]
                )
            else:
                model_iter = int(re.findall(r"(?<=model_)\d+(?=\.pth)", path)[0])
            periodic_checkpoints.append((path, model_iter))

        periodic_checkpoints = [
            pc for pc in periodic_checkpoints if pc[0] not in finished_checkpoints
        ]
        periodic_checkpoints = sorted(periodic_checkpoints, key=lambda x: x[1])
        for pc in periodic_checkpoints:
            yield _add_and_log(pc[0])
            sleep_interval = MIN_SLEEP_INTERVAL

        if final_model_path is None:
            sleep_interval = _log_and_sleep(sleep_interval)
        else:
            yield _add_and_log(final_model_path)
            break

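# Usage sketch: consuming the generator for online evaluation while training
# runs elsewhere. The directory and evaluate_checkpoint are hypothetical.
for ckpt_path in fetch_checkpoints_till_final("/tmp/d2go_run"):
    evaluate_checkpoint(ckpt_path)  # hypothetical evaluation hook
# The loop exits once model_final.pth (or model_final.ckpt) is yielded.
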