Example #1
0
def reduce_loss_dict(loss_dict):
    """
    Average a dictionary of loss tensors across all distributed processes.

    When fewer than two processes are running, ``loss_dict`` is returned
    unchanged. Otherwise the values are stacked into one tensor, reduced
    onto rank 0 and, on rank 0 only, divided by the world size. The
    returned dict has the same keys as ``loss_dict``; on ranks other than
    0 the values are whatever ``torch.distributed.reduce`` leaves in the
    local buffer and are not divided.
    """
    num_procs = get_world_size()
    if num_procs < 2:
        # Nothing to reduce with a single process.
        return loss_dict

    with torch.no_grad():
        # Sorted keys give the stacked tensor a deterministic layout that
        # does not depend on dict insertion order.
        ordered_keys = sorted(loss_dict.keys())
        stacked = torch.stack([loss_dict[key] for key in ordered_keys], dim=0)
        torch.distributed.reduce(stacked, dst=0)
        if torch.distributed.get_rank() == 0:
            # Only the destination rank holds the accumulated result, so
            # only that rank turns it into an average.
            stacked /= num_procs
        reduced_losses = dict(zip(ordered_keys, stacked))
    return reduced_losses
    def __init__(self, cfg):
        """Build the image-level and box-level augmenters from ``cfg``.

        ``cfg.AUTOAUG.LIST`` is consumed positionally:

        * entries 0-3 configure the ``zoom_out`` / ``zoom_in`` image
          augmentations (probability value scaled by 0.05, then level);
        * the remaining entries are read in groups of six, one group per
          sub-policy (``cfg.AUTOAUG.NUM_SUBPOLICIES`` groups): a colour
          operation index, its probability value (scaled by 0.1) and
          level, followed by the same triple for a geometric operation;
        * the six entries after the sub-policies are the ``area`` and
          ``prob`` triples of the scale ratios.

        Also stores the iteration budget, a zeroed counter, the per-GPU
        batch size and the worker count (at least 1 when configured as 0).
        """
        flat_params = cfg.AUTOAUG.LIST
        subpolicy_count = cfg.AUTOAUG.NUM_SUBPOLICIES
        iteration_budget = cfg.SOLVER.MAX_ITER
        splits = cfg.AUTOAUG.SCALE_SPLITS
        box_probability = cfg.AUTOAUG.BOX_PROB

        # Image-level augmentations: the first four entries of the list.
        zoom_params = flat_params[:4]
        zoom_out_cfg = {'prob': zoom_params[0] * 0.05, 'level': zoom_params[1]}
        zoom_in_cfg = {'prob': zoom_params[2] * 0.05, 'level': zoom_params[3]}
        self.img_augs = Img_augs(
            img_augs_dict={'zoom_out': zoom_out_cfg, 'zoom_in': zoom_in_cfg})

        # Box-level augmentations: everything after the first four entries.
        box_params = flat_params[4:]
        color_names = list(color_aug_func.keys())
        geometric_names = list(geometric_aug_func.keys())

        sub_policies = []
        for policy_idx in range(subpolicy_count):
            base = policy_idx * 6
            # The raw index is wrapped with modulo so it always selects a
            # valid operation name.
            color_name = color_names[box_params[base] % len(color_names)]
            color_step = (
                color_name,
                box_params[base + 1] * 0.1,
                box_params[base + 2],
            )
            geometric_name = geometric_names[box_params[base + 3] %
                                             len(geometric_names)]
            geometric_step = (
                geometric_name,
                box_params[base + 4] * 0.1,
                box_params[base + 5],
            )
            sub_policies.append([color_step, geometric_step])

        # The scale ratios sit right after the last sub-policy group.
        tail = subpolicy_count * 6
        ratios = {
            'area': [box_params[tail + offset] for offset in range(3)],
            'prob': [box_params[tail + offset] for offset in range(3, 6)],
        }

        self.box_augs = Box_augs(
            box_augs_dict={'policies': sub_policies, 'scale_ratios': ratios},
            max_iters=iteration_budget,
            scale_splits=splits,
            box_prob=box_probability,
        )
        self.max_iters = iteration_budget

        self.count = 0
        gpu_count = get_world_size()
        self.batch_size = cfg.SOLVER.IMS_PER_BATCH // gpu_count

        # A configured worker count of 0 is bumped to 1.
        workers = cfg.DATALOADER.NUM_WORKERS
        if workers == 0:
            workers += 1
        self.num_workers = workers
Example #3
0
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    """Build the data loader(s) described by ``cfg``.

    Args:
        cfg: configuration object. Reads ``SOLVER.IMS_PER_BATCH`` and
            ``SOLVER.MAX_ITER`` (training), ``TEST.IMS_PER_BATCH`` and
            ``TEST.BBOX_AUG.ENABLED`` (testing), ``DATASETS.TRAIN`` /
            ``DATASETS.TEST``, ``PATHS_CATALOG`` and the ``DATALOADER``
            options ``ASPECT_RATIO_GROUPING``, ``SIZE_DIVISIBILITY`` and
            ``NUM_WORKERS``.
        is_train: select the training (True) or testing (False) setup.
        is_distributed: forwarded to ``make_data_sampler``; at test time it
            also decides whether the sampler shuffles.
        start_iter: forwarded to ``make_batch_data_sampler``; forced to 0
            when ``is_train`` is False.

    Returns:
        A single ``torch.utils.data.DataLoader`` when ``is_train`` is True,
        otherwise a list with one loader per dataset.

    Raises:
        AssertionError: if the configured batch size is not divisible by
            the number of GPUs, or if training yields other than exactly
            one data loader.
    """
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        # The message is parenthesised so that both string fragments and the
        # .format() call belong to the assert statement.
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = bool(is_distributed)
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file("fcos_core.config.paths_catalog",
                                cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    # If bbox aug is enabled in testing, simply set transforms to None and
    # we will apply transforms later
    use_bbox_aug = not is_train and cfg.TEST.BBOX_AUG.ENABLED
    transforms = None if use_bbox_aug else build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog,
                             is_train)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler,
                                                aspect_grouping,
                                                images_per_gpu, num_iters,
                                                start_iter)
        if use_bbox_aug:
            collator = BBoxAugCollator()
        else:
            collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
    """Build the data loader(s) described by ``cfg``.

    Args:
        cfg: configuration object. Reads ``SOLVER.IMS_PER_BATCH`` and
            ``SOLVER.MAX_ITER`` (training), ``TEST.IMS_PER_BATCH``
            (testing), ``DATASETS.TRAIN`` / ``DATASETS.TEST``,
            ``PATHS_CATALOG`` and the ``DATALOADER`` options
            ``ASPECT_RATIO_GROUPING``, ``SIZE_DIVISIBILITY`` and
            ``NUM_WORKERS``.
        is_train: select the training (True) or testing (False) setup.
        is_distributed: forwarded to ``make_data_sampler``; at test time it
            also decides whether the sampler shuffles.
        start_iter: forwarded to ``make_batch_data_sampler``; forced to 0
            when ``is_train`` is False.

    Returns:
        A single ``torch.utils.data.DataLoader`` when ``is_train`` is True,
        otherwise a list with one loader per dataset.

    Raises:
        AssertionError: if the configured batch size is not divisible by
            the number of GPUs, or if training yields other than exactly
            one data loader.
    """
    # Number of participating processes/GPUs.
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        # The batch must split evenly across the GPUs. The message is
        # parenthesised so that both string fragments and the .format()
        # call belong to the assert statement.
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        # Upper bound on the number of training iterations.
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = bool(is_distributed)
        num_iters = None
        start_iter = 0

    # Warn about possible out-of-memory problems with several images per GPU.
    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    # Load the paths-catalog module from the file named by cfg.PATHS_CATALOG
    # and take its DatasetCatalog class (the class itself is passed on, it
    # is not instantiated here).
    paths_catalog = import_file("fcos_core.config.paths_catalog",
                                cfg.PATHS_CATALOG, True)
    DatasetCatalog = paths_catalog.DatasetCatalog
    # Training and testing use different dataset name lists.
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
    # NOTE(review): this looks like a leftover debug print; kept so the
    # function's console output is unchanged.
    print(dataset_list)

    # Transforms applied to the input images.
    transforms = build_transforms(cfg, is_train)
    datasets = build_dataset(dataset_list, transforms, DatasetCatalog,
                             is_train)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(dataset, sampler,
                                                aspect_grouping,
                                                images_per_gpu, num_iters,
                                                start_iter)

        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        return data_loaders[0]
    return data_loaders