def _distributed_worker(local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args):
    """Per-process entry point for a multi-process run.

    Joins the process group at ``dist_url``, binds the process to the CUDA
    device ``local_rank``, registers the process group of the local machine
    in ``dist._LOCAL_PROCESS_GROUP`` and finally calls ``main_func(*args)``.

    Args:
        local_rank: index of this process on its machine (also the CUDA device index).
        main_func: callable executed once the process group is ready.
        world_size: total number of processes over all machines.
        num_gpus_per_machine: number of processes started on each machine.
        machine_rank: index of the machine this process runs on.
        dist_url: init method passed to ``dist.init_process_group``.
        args: positional arguments forwarded to ``main_func``.
    """
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."

    # Rank of this process among all processes of all machines.
    global_rank = local_rank + num_gpus_per_machine * machine_rank
    try:
        dist.init_process_group(
            backend="NCCL",
            init_method=dist_url,
            world_size=world_size,
            rank=global_rank,
        )
    except Exception as e:
        # Log the URL that failed before propagating the error.
        err_logger = setup_logger(__name__)
        err_logger.error("Process group URL: {}".format(dist_url))
        raise e

    dist.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine).
    # Every process creates one group per machine but keeps only its own.
    assert dist._LOCAL_PROCESS_GROUP is None
    for machine_idx in range(world_size // num_gpus_per_machine):
        first_rank = machine_idx * num_gpus_per_machine
        members = list(range(first_rank, first_rank + num_gpus_per_machine))
        group = dist.new_group(members)
        if machine_idx == machine_rank:
            dist._LOCAL_PROCESS_GROUP = group

    main_func(*args)
def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_ip=None, dist_port=None, args=()):
    """Run ``main_func`` directly or in one spawned process per GPU.

    With a world size of one, ``main_func(*args)`` is called in the current
    process. Otherwise ``num_gpus_per_machine`` processes are spawned, each
    running ``_distributed_worker``.

    Args:
        main_func: callable to execute.
        num_gpus_per_machine: number of processes to spawn on this machine.
        num_machines: number of machines taking part in the job.
        machine_rank: index of this machine; machine 0 uses the loopback
            address and picks a free port when ``dist_port`` is None.
        dist_ip: address of the main machine (required when ``machine_rank != 0``).
        dist_port: port of the main machine (required when ``machine_rank != 0``).
        args: positional arguments forwarded to ``main_func``.
    """
    world_size = num_gpus_per_machine * num_machines

    # Single process: nothing to spawn.
    if world_size == 1:
        main_func(*args)
        return

    on_main_machine = machine_rank == 0
    if on_main_machine:
        dist_ip = "127.0.0.1"
        if dist_port is None:
            dist_port = _find_free_port()
    else:
        assert num_machines > 1, "At least 2 machines is needed"
        assert dist_ip is not None, "Set main machine ip address"
        assert dist_port is not None, "Set dist port number which is same with main machine port"

    dist_url = f"tcp://{dist_ip}:{dist_port}"
    logger = setup_logger(__name__)
    logger.info(f"pytorch distribute url : {dist_url}")

    worker_args = (main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args)
    mp.spawn(
        _distributed_worker,
        nprocs=num_gpus_per_machine,
        args=worker_args,
        daemon=False,
    )
def main(args):
    """Load the config named by ``args.config_file`` and log it.

    A negative ``cfg.SEED`` is replaced by the value returned from
    ``dist.shared_random_seed()``.
    """
    _logger = setup_logger(__name__)
    cfg = get_cfg(args.config_file)

    seed_is_unset = cfg.SEED < 0
    if seed_is_unset:
        cfg.SEED = dist.shared_random_seed()

    config_dump = f'Config File : \n{cfg}'
    _logger.debug(config_dump)
def __init__(self, cfg):
    """Build the test loader and model, then load the weights from ``cfg.WEIGHTS``.

    Args:
        cfg: project config; reads ``VISUALIZE_DIR``, ``OUTPUT_DIR``,
            ``WEIGHTS`` and ``LOADER.TEST_DATASET``.
    """
    self._logger = setup_logger(__name__, all_rank=True)

    if dist.is_main_process():
        self._logger.debug(f'Config File : \n{cfg}')

    # exist_ok=True replaces the former isdir()-then-makedirs() sequence, which
    # could raise FileExistsError when several ranks reach this point together.
    if cfg.VISUALIZE_DIR:
        os.makedirs(cfg.VISUALIZE_DIR, exist_ok=True)
    self.visualize_dir = cfg.VISUALIZE_DIR
    dist.synchronize()

    self.test_loader = build_test_loader(cfg)

    self.model = build_model(cfg)
    self.model.eval()
    if dist.is_main_process():
        self._logger.debug(f"Model Structure\n{self.model}")

    # Wrap the model only when more than one process takes part in the run.
    if dist.get_world_size() > 1:
        self.model = DistributedDataParallel(
            self.model,
            device_ids=[dist.get_local_rank()],
            broadcast_buffers=False,
        )

    # The checkpointer receives the (possibly wrapped) model.
    self.checkpointer = Checkpointer(
        self.model,
        cfg.OUTPUT_DIR,
    )
    self.checkpointer.load(cfg.WEIGHTS)

    self.meta_data = MetadataCatalog.get(cfg.LOADER.TEST_DATASET)
    # Fixed palette of five colour triples.
    self.class_color = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]
def __init__(self, warmup_iter=3):
    """Create the logger and the timing state.

    Args:
        warmup_iter: stored as ``_warmup_iter``; not used inside this method.
    """
    self.logger = setup_logger(__name__)
    self._warmup_iter = warmup_iter

    # Creation order is kept: step timer, start timestamp, total timer.
    step_timer = Timer()
    started_at = time.perf_counter()
    total_timer = Timer()

    self._step_timer = step_timer
    self._start_time = started_at
    self._total_timer = total_timer
def __init__(self, cfg, distributed=True):
    """Store the evaluation settings and look up the test dataset metadata.

    Args:
        cfg: project config; ``cfg.LOADER.TEST_DATASET`` names the dataset.
        distributed: stored as ``_distributed``; not used inside this method.
    """
    self._distributed = distributed
    self._cpu_device = torch.device("cpu")
    self._logger = setup_logger(__name__)

    dataset_name = cfg.LOADER.TEST_DATASET
    metadata = MetadataCatalog.get(dataset_name)

    self._dataset_name = dataset_name
    self._metadata = metadata
    self._category = metadata.get("category_names")
def evaluator(model, data_loader, evaluators):
    """Run ``model`` over ``data_loader`` and return the evaluators' results.

    Each batch is passed to the model under ``inference_context`` and
    ``torch.no_grad()``; inputs and outputs are handed to
    ``evaluators.process``. Progress is logged whenever the timer reports
    more than 10 total seconds since its last reset.

    Args:
        model: callable taking one batch from ``data_loader``.
        data_loader: iterable with a fixed length (``len()`` must work).
        evaluators: object exposing ``reset``, ``process`` and ``evaluate``;
            ``None`` is replaced by ``Evaluators([])``.

    Returns:
        Whatever ``evaluators.evaluate()`` returns, or ``{}`` when it returns None.
    """
    num_devices = dist.get_world_size()
    _logger = setup_logger(__name__, all_rank=True)

    total = len(data_loader)  # inference data loader must have a fixed length
    _logger.info(f"Start inference on {total} images")

    if evaluators is None:
        evaluators = Evaluators([])
    evaluators.reset()

    # NOTE(review): timer.seconds() is presumably the time spent between
    # resume()/pause(), and total_seconds() the elapsed time overall — confirm
    # against the Timer implementation.
    timer = Timer(warmup=5, pause=True)
    total_compute_time = 0
    total_time = 0

    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            timer.resume()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            timer.pause()
            evaluators.process(inputs, outputs)

            if timer.total_seconds() > 10:
                # Fold the current timer window into the running totals.
                total_compute_time += timer.seconds()
                total_time += timer.total_seconds()
                timer.reset(pause=True)

                processed = idx + 1
                total_seconds_per_img = total_time / processed
                seconds_per_img = total_compute_time / processed
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - processed)))
                _logger.info(
                    f"Inference done {processed}/{total}. {seconds_per_img:.4f} s / img. ETA={eta}"
                )

    # Account for the time accumulated since the last reset.
    total_compute_time += timer.seconds()
    total_time += timer.total_seconds()

    # An empty data loader previously raised ZeroDivisionError here; clamp the
    # divisor so the summary can still be logged.
    divisor = max(total, 1)
    time_per_img = total_time / divisor
    compute_per_img = total_compute_time / divisor

    total_time_str = str(datetime.timedelta(seconds=total_time))
    _logger.info(
        f"Total inference time: {total_time_str} ({time_per_img:.6f} s / img per device, on {num_devices} devices)"
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    _logger.info(
        f"Total inference pure compute time: {total_compute_time_str} ({compute_per_img:.6f} s / img per device, on {num_devices} devices)"
    )

    results = evaluators.evaluate()
    if results is None:
        results = {}
    return results
def __init__(self, cfg, distributed=True):
    """Prepare the output directory, dataset metadata and the COCO API object.

    Args:
        cfg: project config; reads ``OUTPUT_DIR`` and ``LOADER.TEST_DATASET``.
        distributed: stored as ``_distributed``; not used inside this method.
    """
    self._distributed = distributed
    self._output_dir = cfg.OUTPUT_DIR
    # exist_ok=True replaces the former isdir()-then-makedirs() sequence, which
    # could raise FileExistsError when several processes build this object
    # at the same time.
    if self._output_dir:
        os.makedirs(self._output_dir, exist_ok=True)

    self._cpu_device = torch.device("cpu")
    self._logger = setup_logger(__name__)

    dataset_name = cfg.LOADER.TEST_DATASET
    self._metadata = MetadataCatalog.get(dataset_name)
    self._category = self._metadata.get("category_names")

    # Anything COCO() prints to stdout while loading is captured and discarded.
    with contextlib.redirect_stdout(io.StringIO()):
        self._coco_api = COCO(self._metadata.json_file)

    super().__init__(cfg)
def _load_from_state_dict(
    self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
    """Patch older state dicts before delegating to the parent loader.

    * version missing or < 2: absent ``running_mean`` / ``running_var``
      entries are filled with zeros / ones shaped like the module's buffers.
    * version < 3: ``self.eps`` is subtracted from ``running_var``.

    ``state_dict`` is modified in place.
    """
    mean_key = prefix + "running_mean"
    var_key = prefix + "running_var"
    version = local_metadata.get("version", None)

    lacks_running_stats = version is None or version < 2
    if lacks_running_stats:
        # No running_mean/var in early versions
        # This will silent the warnings
        if mean_key not in state_dict:
            state_dict[mean_key] = torch.zeros_like(self.running_mean)
        if var_key not in state_dict:
            state_dict[var_key] = torch.ones_like(self.running_var)

    needs_eps_fix = version is not None and version < 3
    if needs_eps_fix:
        logger = setup_logger(__name__)
        logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
        # In version < 3, running_var are used without +eps.
        state_dict[var_key] -= self.eps

    super()._load_from_state_dict(
        state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    )
def __init__(self, cfg, distributed=True):
    """Store evaluation settings and derive annotation / image-set paths.

    The dataset metadata must provide ``data_root``, ``split`` and ``year``;
    ``year`` has to be 2007 or 2012.

    Args:
        cfg: project config; ``cfg.LOADER.TEST_DATASET`` names the dataset.
        distributed: stored as ``_distributed``; not used inside this method.
    """
    self._distributed = distributed
    self._cpu_device = torch.device("cpu")
    self._logger = setup_logger(__name__)

    self._dataset_name = cfg.LOADER.TEST_DATASET
    metadata = MetadataCatalog.get(self._dataset_name)
    self._metadata = metadata
    self._category = metadata.get("category_names")

    data_root = metadata.get('data_root')
    # "{}" is left as a placeholder in the annotation path template.
    self._anno_file_template = os.path.join(data_root, "Annotations", "{}.xml")
    split_file = metadata.get('split') + ".txt"
    self._image_set_path = os.path.join(data_root, "ImageSets", "Main", split_file)

    year = metadata.get('year')
    assert year in (2007, 2012), year
    self._is_2007 = year == 2007

    super().__init__(cfg)
def __init__(self, cfg, checkpointer):
    """Remember the checkpointer and the checkpoint settings from ``cfg.SOLVER``.

    Args:
        cfg: project config; reads ``SOLVER.CHECKPOINT_PERIOD`` (converted to
            int) and ``SOLVER.CHECKPOINT_KEEP``.
        checkpointer: stored as ``checkpointer``; not used inside this method.
    """
    self.logger = setup_logger(__name__)
    self.checkpointer = checkpointer

    checkpoint_period = int(cfg.SOLVER.CHECKPOINT_PERIOD)
    self.period = checkpoint_period
    self.max_to_keep = cfg.SOLVER.CHECKPOINT_KEEP

    # Starts out empty.
    self.recent_checkpoints = list()
from vistem import dist
from vistem.config import get_cfg
from vistem.utils import setup_logger
from vistem.engine import launch, default_argument_parser
from vistem.modeling import build_model
from vistem.loader import build_train_loader, build_test_loader

logger = setup_logger(__name__)


def main(args):
    """Build the model and data loaders, then log a summary on the main process.

    The summary covers the model and backbone structure, backbone output
    attributes, the first element of one training batch and the total
    parameter count.
    """
    cfg = get_cfg(args.config_file)

    model = build_model(cfg)
    train_loader = build_train_loader(cfg)
    test_loader = build_test_loader(cfg)  # built but not used further in this function

    # NOTE(review): everything below is assumed to belong to the main-process
    # guard — confirm against the original indentation.
    if not dist.is_main_process():
        return

    backbone = model.backbone
    logger.info(f'Model Structure\n{model}')
    logger.info(f'Backbone Network\n{backbone}')
    logger.debug(f'Backbone Output Shape : {backbone.output_shape()}')
    logger.debug(f'Backbone Output Features : {backbone.out_features}')
    logger.debug(f'Backbone Stride : {backbone.out_feature_strides}')
    logger.debug(f'Backbone Output Channels : {backbone.out_feature_channels}')

    # Pull a single batch and show its first element.
    input_data = next(iter(train_loader))
    logger.debug(f'Input Data Structure\n{input_data[0]}')

    total_param = sum(param.numel() for param in model.parameters())
    logger.debug(f'The Number of Model Parameters : {total_param}')
def __init__(self, cfg):
    """Initialise the logger, an empty hook list and the iteration bounds.

    Args:
        cfg: project config; ``cfg.SOLVER.MAX_ITER`` is stored as ``max_iter``.
    """
    self._logger = setup_logger(__name__)

    # No hooks are registered at construction time.
    self._hooks = list()

    # Iteration range: starts at 0, upper bound taken from the config.
    self.start_iter = 0
    solver_cfg = cfg.SOLVER
    self.max_iter = solver_cfg.MAX_ITER
def __init__(self, cfg):
    """Initialise the logger and the write period.

    Args:
        cfg: project config; ``cfg.TEST.WRITER_PERIOD`` is stored as ``_period``.
    """
    self.logger = setup_logger(__name__)

    test_cfg = cfg.TEST
    self._period = test_cfg.WRITER_PERIOD

    # No write recorded yet.
    self._last_write = None