Example 1
def _distributed_worker(local_rank, main_func, world_size,
                        num_gpus_per_machine, machine_rank, dist_url, args):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(backend="NCCL",
                                init_method=dist_url,
                                world_size=world_size,
                                rank=global_rank)
    except Exception as e:
        logger = setup_logger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e

    dist.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert dist._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            dist._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
Example 2
def launch(main_func,
           num_gpus_per_machine,
           num_machines=1,
           machine_rank=0,
           dist_ip=None,
           dist_port=None,
           args=()):
    world_size = num_machines * num_gpus_per_machine
    if world_size == 1:
        main_func(*args)

    else:
        if machine_rank == 0:
            dist_ip = "127.0.0.1"
            if dist_port is None: dist_port = _find_free_port()

        else:
            assert num_machines > 1, "At least 2 machines are needed"
            assert dist_ip is not None, "Set the main machine's IP address"
            assert dist_port is not None, "Set the dist port number to match the main machine's port"

        dist_url = f"tcp://{dist_ip}:{dist_port}"
        logger = setup_logger(__name__)
        logger.info(f"PyTorch distributed URL: {dist_url}")

        mp.spawn(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(main_func, world_size, num_gpus_per_machine, machine_rank,
                  dist_url, args),
            daemon=False,
        )
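The launch function above calls a _find_free_port helper that is not shown in this example. A minimal sketch of the usual approach, written here as an assumption rather than the repository's actual implementation: bind a socket to port 0 so the OS picks an unused port, read the chosen port back, and pass it to the TCP init method.

import socket

def _find_free_port():
    # Assumed implementation: binding to port 0 lets the OS choose a free port;
    # read the chosen port number back and release the socket before returning.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    return port

Note that another process can still grab the port between this call and init_process_group; the pattern only minimizes the chance of a collision.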
Example 3
def main(args):
    _logger = setup_logger(__name__)

    cfg = get_cfg(args.config_file)
    if cfg.SEED < 0: cfg.SEED = dist.shared_random_seed()
    
    _logger.debug(f'Config File : \n{cfg}')
Example 4
    def __init__(self, cfg):
        self._logger = setup_logger(__name__, all_rank=True)
        
        if dist.is_main_process():
            self._logger.debug(f'Config File : \n{cfg}')
            if cfg.VISUALIZE_DIR and not os.path.isdir(cfg.VISUALIZE_DIR):
                os.makedirs(cfg.VISUALIZE_DIR)
            self.visualize_dir = cfg.VISUALIZE_DIR
        dist.synchronize()
        
        self.test_loader = build_test_loader(cfg)

        self.model = build_model(cfg)
        self.model.eval()
        if dist.is_main_process():
            self._logger.debug(f"Model Structure\n{self.model}")
                
        if dist.get_world_size() > 1:
            self.model = DistributedDataParallel(self.model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)

        self.checkpointer = Checkpointer(
            self.model,
            cfg.OUTPUT_DIR,
        )
        self.checkpointer.load(cfg.WEIGHTS)

        self.meta_data = MetadataCatalog.get(cfg.LOADER.TEST_DATASET)
        self.class_color = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]
Example 5
    def __init__(self, warmup_iter=3):
        self.logger = setup_logger(__name__)

        self._warmup_iter = warmup_iter
        self._step_timer = Timer()
        self._start_time = time.perf_counter()
        self._total_timer = Timer()
Example 6
    def __init__(self, cfg, distributed=True):
        self._distributed = distributed

        self._cpu_device = torch.device("cpu")
        self._logger = setup_logger(__name__)

        self._dataset_name = cfg.LOADER.TEST_DATASET
        self._metadata = MetadataCatalog.get(self._dataset_name)

        self._category = self._metadata.get("category_names")
Example 7
def evaluator(model, data_loader, evaluators):
    num_devices = dist.get_world_size()
    _logger = setup_logger(__name__, all_rank=True)

    total = len(data_loader)  # inference data loader must have a fixed length
    _logger.info(f"Start inference on {total} images")

    if evaluators is None: evaluators = Evaluators([])
    evaluators.reset()

    timer = Timer(warmup=5, pause=True)
    total_compute_time = 0
    total_time = 0

    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            timer.resume()
            outputs = model(inputs)
            if torch.cuda.is_available(): torch.cuda.synchronize()
            timer.pause()
            evaluators.process(inputs, outputs)

            if timer.total_seconds() > 10:
                total_compute_time += timer.seconds()
                total_time += timer.total_seconds()
                timer.reset(pause=True)

                total_seconds_per_img = total_time / (idx + 1)
                seconds_per_img = total_compute_time / (idx + 1)
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                _logger.info(
                    f"Inference done {idx + 1}/{total}. {seconds_per_img:.4f} s / img. ETA={eta}"
                )

    total_compute_time += timer.seconds()
    total_time += timer.total_seconds()

    total_time_str = str(datetime.timedelta(seconds=total_time))
    _logger.info(
        f"Total inference time: {total_time_str} ({total_time / total:.6f} s / img per device, on {num_devices} devices)"
    )

    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    _logger.info(
        f"Total inference pure compute time: {total_compute_time_str} ({total_compute_time / total:.6f} s / img per device, on {num_devices} devices)"
    )

    results = evaluators.evaluate()
    if results is None: results = {}
    return results
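A hedged sketch of driving evaluator end to end, wrapped in a function so the snippet stays free of imports that the other examples do not show. get_cfg, build_model, build_test_loader, and Checkpointer are the names used elsewhere on this page; the run_eval wrapper itself is hypothetical.

def run_eval(args):
    # Build the model and test loader the same way Examples 4 and 12 do,
    # restore the weights, and hand everything to evaluator() above.
    cfg = get_cfg(args.config_file)
    model = build_model(cfg)
    model.eval()
    Checkpointer(model, cfg.OUTPUT_DIR).load(cfg.WEIGHTS)
    test_loader = build_test_loader(cfg)
    # Passing None makes evaluator() fall back to an empty Evaluators([]).
    return evaluator(model, test_loader, None)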
Example 8
    def __init__(self, cfg, distributed=True):
        self._distributed = distributed
        self._output_dir = cfg.OUTPUT_DIR
        if self._output_dir and not os.path.isdir(self._output_dir):
            os.makedirs(self._output_dir)

        self._cpu_device = torch.device("cpu")
        self._logger = setup_logger(__name__)

        dataset_name = cfg.LOADER.TEST_DATASET
        self._metadata = MetadataCatalog.get(dataset_name)

        self._category = self._metadata.get("category_names")
        with contextlib.redirect_stdout(io.StringIO()):
            self._coco_api = COCO(self._metadata.json_file)

        super().__init__(cfg)
Example 9
    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # Early versions did not save running_mean/var buffers;
            # fill them in here to silence the load warnings.
            if prefix + "running_mean" not in state_dict:
                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
            if prefix + "running_var" not in state_dict:
                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)

        if version is not None and version < 3:
            logger = setup_logger(__name__)
            logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
            # In versions < 3, running_var was used without +eps.
            state_dict[prefix + "running_var"] -= self.eps

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )
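The version field read from local_metadata is produced by PyTorch's own state_dict machinery: an nn.Module subclass can set a _version class attribute, state_dict() records it in the metadata attached to the returned dict, and _load_from_state_dict receives it back as local_metadata["version"]. Checkpoints written by older code therefore carry the smaller number, which is how the loader above knows to fill in missing buffers and subtract eps. A minimal sketch of the mechanism (the VersionedModule name is invented for illustration):

import torch
import torch.nn as nn

class VersionedModule(nn.Module):
    # nn.Module copies this class attribute into the state_dict metadata when
    # saving, so loaders can tell which code version produced a checkpoint.
    _version = 3

    def __init__(self):
        super().__init__()
        self.register_buffer("running_var", torch.ones(4))

m = VersionedModule()
state = m.state_dict()
print(state._metadata[""]["version"])  # -> 3 for checkpoints written by this class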
Example 10
    def __init__(self, cfg, distributed=True):
        self._distributed = distributed

        self._cpu_device = torch.device("cpu")
        self._logger = setup_logger(__name__)

        self._dataset_name = cfg.LOADER.TEST_DATASET
        self._metadata = MetadataCatalog.get(self._dataset_name)

        self._category = self._metadata.get("category_names")

        data_root = self._metadata.get('data_root')
        self._anno_file_template = os.path.join(data_root, "Annotations",
                                                "{}.xml")
        self._image_set_path = os.path.join(
            data_root, "ImageSets", "Main",
            self._metadata.get('split') + ".txt")

        year = self._metadata.get('year')
        assert year in [2007, 2012], year
        self._is_2007 = year == 2007

        super().__init__(cfg)
Example 11
    def __init__(self, cfg, checkpointer):
        self.logger = setup_logger(__name__)
        self.checkpointer = checkpointer
        self.period = int(cfg.SOLVER.CHECKPOINT_PERIOD)
        self.max_to_keep = cfg.SOLVER.CHECKPOINT_KEEP
        self.recent_checkpoints = []
Example 12
from vistem import dist
from vistem.config import get_cfg
from vistem.utils import setup_logger
from vistem.engine import launch, default_argument_parser

from vistem.modeling import build_model
from vistem.loader import build_train_loader, build_test_loader

logger = setup_logger(__name__)

def main(args):
    cfg = get_cfg(args.config_file)

    model = build_model(cfg)
    train_loader = build_train_loader(cfg)
    test_loader = build_test_loader(cfg)

    if dist.is_main_process():
        logger.info(f'Model Structure\n{model}')
        logger.info(f'Backbone Network\n{model.backbone}')
        logger.debug(f'Backbone Output Shape : {model.backbone.output_shape()}')
        logger.debug(f'Backbone Output Features : {model.backbone.out_features}')
        logger.debug(f'Backbone Stride : {model.backbone.out_feature_strides}')
        logger.debug(f'Backbone Output Channels : {model.backbone.out_feature_channels}')
        
        train_iter = iter(train_loader)
        input_data = next(train_iter)
        logger.debug(f'Input Data Structure\n{input_data[0]}')

        total_param = sum(p.numel() for p in model.parameters())
        logger.debug(f'The Number of Model Parameters : {total_param}')
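A script like this is normally wired to the launch function from Example 2 through a small entry point. A hedged sketch, assuming default_argument_parser() exposes num_gpus, num_machines, machine_rank, dist_ip, and dist_port attributes (those attribute names are an assumption, not taken from the source):

if __name__ == "__main__":
    # Hypothetical entry point: the attribute names on `args` are assumed;
    # the launch() call itself follows the signature shown in Example 2.
    args = default_argument_parser().parse_args()
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_ip=args.dist_ip,
        dist_port=args.dist_port,
        args=(args,),
    )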
Example 13
    def __init__(self, cfg):
        self._logger = setup_logger(__name__)
        self._hooks = []

        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
Example 14
    def __init__(self, cfg):
        self.logger = setup_logger(__name__)
        self._period = cfg.TEST.WRITER_PERIOD
        self._last_write = None