Esempio n. 1
0
def train(cfg, local_rank, distributed, tb_logger=None):
    """Build the detection model, load its checkpoint and run the training loop.

    Args:
        cfg: Project config object. This function reads ``MODEL.DEVICE``,
            ``MODEL.WEIGHT``, ``OUTPUT_DIR``, ``SOLVER.RESUME`` and
            ``SOLVER.CHECKPOINT_PERIOD`` and forwards the whole object to the
            model/optimizer/scheduler/data-loader builders and to
            ``do_train``.
        local_rank: Device index passed as ``device_ids`` / ``output_device``
            when the model is wrapped for distributed training.
        distributed: When truthy, wrap the model in
            ``DistributedDataParallel``.
        tb_logger: Optional logger forwarded to ``do_train``. When ``None``
            (the default) a ``Logger(cfg.OUTPUT_DIR)`` is created here, which
            is what this function did before the parameter existed.

    Returns:
        The model that was handed to ``do_train`` (the
        ``DistributedDataParallel`` wrapper when ``distributed`` is truthy).
    """
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        # The ``torch.nn.parallel.deprecated`` namespace is not available in
        # current PyTorch releases; the regular DistributedDataParallel
        # accepts the same keyword arguments used here.
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    # Mutable training state handed to do_train; starts at iteration 0 and is
    # overwritten with checkpoint data when resuming.
    arguments = {"iteration": 0}

    output_dir = cfg.OUTPUT_DIR

    # Only the rank-0 process is told to write checkpoints.
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(
        cfg.MODEL.WEIGHT, resume=cfg.SOLVER.RESUME
    )
    if cfg.SOLVER.RESUME:
        arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    if tb_logger is None:
        # Preserve the original behaviour for callers that pass no logger.
        tb_logger = Logger(cfg.OUTPUT_DIR)

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tb_logger,
        cfg,
    )

    return model
Esempio n. 2
0
def main(args):
    """Set up distributed state, config and logging from ``args``, then train.

    ``args`` is an already-parsed namespace. This function reads
    ``args.config_file``, ``args.opts`` and ``args.num_gpus`` from it, and
    writes ``args.distributed`` and ``args.local_rank`` onto it.

    Distributed mode is enabled when the ``WORLD_SIZE`` environment variable
    is present and greater than 1; in that case the CUDA device is set to
    ``args.local_rank`` and the NCCL process group is initialised from the
    environment (``env://``).

    NOTE(review): ``get_rank()`` is evaluated before
    ``torch.distributed.init_process_group`` is called - confirm the helper
    returns the intended rank at that point.
    """
    # A missing WORLD_SIZE means a single process.
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = world_size > 1
    args.local_rank = get_rank() % args.num_gpus

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl",
            init_method="env://",
        )

    config_path = args.config_file
    print(config_path)

    # Layer the config: file first, then command-line overrides, then lock it.
    cfg.merge_from_file(config_path)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(args.num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    # Echo the raw config file followed by the fully merged config.
    logger.info("Loaded configuration file {}".format(config_path))
    with open(config_path, "r") as config_file:
        logger.info("\n" + config_file.read())
    logger.info("Running with config:\n{}".format(cfg))

    tb_logger = Logger(cfg.OUTPUT_DIR, get_rank())
    train(cfg, args.local_rank, args.distributed, tb_logger)