コード例 #1
0
ファイル: plain_train_net.py プロジェクト: FateScript/cvpods
def setup(args):
    """
    Build the run configuration and apply the default setup.

    Starting from ``get_cfg()``, values are overridden first by the file at
    ``args.config_file`` and then by the ``args.opts`` list. The config is
    frozen before it is handed to ``default_setup``.

    Args:
        args: parsed arguments; ``config_file`` and ``opts`` are read here
            and the whole object is forwarded to ``default_setup``.

    Returns:
        The frozen config object.
    """
    config = get_cfg()

    # Later merges win: file values first, then the ``opts`` overrides.
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()

    # Replace this call with your own setup code if the defaults do not fit.
    default_setup(config, args)
    return config
コード例 #2
0
def main(args):
    """
    Evaluate every checkpoint selected by ``get_valid_files``.

    For each selected file the model is rebuilt, the weights are loaded via
    ``DetectionCheckpointer`` and the model is tested (with TTA when
    ``cfg.TEST.AUG.ENABLED`` is set). Results are verified on the main
    process only.

    Args:
        args: parsed arguments; ``opts``, ``debug``, ``num_gpus`` and
            ``resume`` are read here, and the whole object is passed on to
            ``default_setup`` and ``get_valid_files``.
    """
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)

    if args.debug:
        # In debug mode the total batch is derived from the per-device batch.
        debug_batch = int(cfg.SOLVER.IMS_PER_DEVICE * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != debug_batch:
            cfg.SOLVER.IMS_PER_BATCH = debug_batch
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(debug_batch))

    for weights_path in get_valid_files(args, cfg, logger):
        cfg.MODEL.WEIGHTS = weights_path
        model = build_model(cfg)

        checkpointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)

        if cfg.TEST.AUG.ENABLED:
            evaluate = Trainer.test_with_TTA
        else:
            evaluate = Trainer.test
        res = evaluate(cfg, model)

        if comm.is_main_process():
            verify_results(cfg, res)
コード例 #3
0
def main(args):
    """
    Set up the config, build a runner and either train or evaluate.

    The runner class is looked up in ``RUNNERS`` by ``cfg.TRAINER.NAME``,
    wrapped with ``runner_decrator`` and instantiated with the config and
    ``build_model``. Before running, a warning is logged when the space
    available under ``cfg.OUTPUT_DIR`` is smaller than the estimated size of
    the checkpoints this run will dump.

    Args:
        args: parsed arguments; ``opts``, ``resume`` and ``eval_only`` are
            read here and the whole object is passed to ``default_setup``.
    """
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)

    # If you'd like to do anything fancier than the standard training logic,
    # consider writing your own training loop or subclassing the runner.
    runner = runner_decrator(RUNNERS.get(cfg.TRAINER.NAME))(cfg, build_model)
    runner.resume_or_load(resume=args.resume)

    # Check whether the workspace has enough storage space, assuming that a
    # single dumped model is 700 MB and that one model is dumped every
    # CHECKPOINT_PERIOD iterations.
    # f_bavail is used instead of f_bfree: f_bfree also counts blocks that
    # are reserved for the superuser, so it over-reports what an ordinary
    # training process can actually write.
    file_sys = os.statvfs(cfg.OUTPUT_DIR)
    free_space_Gb = (file_sys.f_bavail * file_sys.f_frsize) / 2**30
    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER //
                     cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
    if eval_space_Gb > free_space_Gb:
        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")

    if cfg.TEST.AUG.ENABLED:
        # The lambda defers the TTA evaluation until the hook triggers it.
        runner.register_hooks([
            hooks.EvalHook(0, lambda: runner.test_with_TTA(cfg, runner.model))
        ])

    logger.info("Running with full config:\n{}".format(cfg))
    # Instantiate the parent config class to report what this config changes.
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    if args.eval_only:
        runner.test(cfg, runner.model)
        return
    runner.train()
コード例 #4
0
def stage_main(args, cfg, build):
    """
    Replay a single training step from a dumped debug checkpoint.

    The checkpoint is expected to contain an ``"inputs"`` entry next to the
    model/optimizer state; that entry is fed to ``DebugTrainer`` for one
    ``run_step``.

    Args:
        args: parsed arguments; ``opts`` and ``ckpt_file`` are read here and
            the whole object is passed to ``default_setup``. When
            ``ckpt_file`` is ``None`` the most recently modified
            ``debug_*.pth`` file in ``./log`` is used.
        cfg: config object; it is updated from ``args.opts`` and replaced by
            the one returned from ``default_setup``.
        build: callable that builds the model from ``cfg``.

    Raises:
        FileNotFoundError: if ``ckpt_file`` is not given and ``./log`` does
            not exist or holds no ``debug_*.pth`` file.
        AssertionError: if more than one process is running, or if the
            loaded checkpoint has no ``"inputs"`` entry.
    """
    assert comm.get_world_size() == 1, "DEBUG mode only supported for 1 GPU"

    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model = build(cfg)
    optimizer = build_optimizer(cfg, model)
    debug_ckpt = Checkpointer(model, resume=True, optimizer=optimizer)

    ckpt_file = args.ckpt_file
    if ckpt_file is None:
        # Find the latest checkpoint in the log dir if ckpt_file is not given.
        log_dir = "./log"
        # fullmatch with escaped dots: only real "debug_<something>.pth"
        # names qualify, not any name that merely contains "pth".
        matched_files = [
            os.path.join(log_dir, name) for name in os.listdir(log_dir)
            if re.fullmatch(r"debug_.*\.pth", name) is not None
        ]
        if not matched_files:
            raise FileNotFoundError(
                "no debug checkpoint (debug_*.pth) found in {}".format(
                    log_dir))
        # Modification time identifies the newest dump; access time changes
        # whenever a file is merely read.
        ckpt_file = max(matched_files, key=os.path.getmtime)

    left_dict = debug_ckpt.load(ckpt_file)
    assert "inputs" in left_dict, "input data not found in checkpoints"
    data = left_dict["inputs"]

    trainer = DebugTrainer(model, data, optimizer)
    logger.info("start run models")
    trainer.run_step()
    logger.info("finish debuging")
コード例 #5
0
ファイル: train_net.py プロジェクト: FateScript/cvpods
def main(args, config, build_model):
    """
    Train with the runner selected by ``cfg.TRAINER.NAME``.

    After training, the final checkpoint is converted to the pretrain weight
    format on the main process when ``cfg.MODEL.AS_PRETRAIN`` is set.

    Args:
        args: parsed arguments; ``opts``, ``resume`` and ``clearml`` are read
            here and the whole object is passed to ``default_setup``.
        config: config object, updated in place from ``args.opts``.
        build_model: model builder handed to the runner.
    """
    config.merge_from_list(args.opts)
    cfg = default_setup(config, args)

    # If you'd like to do anything fancier than the standard training logic,
    # consider writing your own training loop or subclassing the runner.
    runner_cls = runner_decrator(RUNNERS.get(cfg.TRAINER.NAME))
    runner = runner_cls(cfg, build_model)
    runner.resume_or_load(resume=args.resume)

    # Collect the optional hooks first, then register them in one call.
    extra_hooks = []
    if args.clearml:
        # Imported lazily so the module is only needed when requested.
        from cvpods.engine.clearml import ClearMLHook
        if comm.is_main_process():
            extra_hooks.append(ClearMLHook())
    if cfg.TEST.AUG.ENABLED:
        tta_hook = hooks.EvalHook(
            0, lambda: runner.test_with_TTA(cfg, runner.model))
        extra_hooks.append(tta_hook)
    if extra_hooks:
        runner.register_hooks(extra_hooks)

    logger.info("Running with full config:\n{}".format(cfg))
    # Instantiate the parent config class to report what this config changes.
    parent_cfg = cfg.__class__.__base__()
    config_diff = cfg.diff(parent_cfg)
    logger.info("different config with base class:\n{}".format(config_diff))

    runner.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # Convert the last checkpoint to the pretrain format.
        final_ckpt = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
        pretrain_file = os.path.join(cfg.OUTPUT_DIR,
                                     "model_final_pretrain_weight.pkl")
        convert_to_pretrained_model(input=final_ckpt, save_path=pretrain_file)
コード例 #6
0
def main(args):
    """
    Select checkpoints and evaluate each of them.

    Selection rules:
      * If ``"MODEL.WEIGHTS"`` appears in ``args.opts``, only that file is
        evaluated. A ``.pth`` path that ``PathManager`` cannot find is
        replaced by a path under ``cfg.OSS.DUMP_PREFIX``.
      * Otherwise the ``*.pth`` files in ``cfg.OUTPUT_DIR`` are listed. With
        no ``args.end_iter`` only the most recently changed file is used;
        with it, files whose iteration number (parsed from the
        ``model_<iter>.pth`` name) lies in
        ``[args.start_iter, args.end_iter]`` are used.

    Args:
        args: parsed arguments; ``opts``, ``debug``, ``num_gpus``,
            ``start_iter``, ``end_iter`` and ``resume`` are read here and the
            whole object is passed to ``default_setup``.

    Raises:
        AssertionError: if no checkpoint is found or none is in range.
    """
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)

    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_BATCH / 8 * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    if "MODEL.WEIGHTS" in args.opts:
        missing_locally = (cfg.MODEL.WEIGHTS.endswith(".pth")
                           and not PathManager.exists(cfg.MODEL.WEIGHTS))
        if missing_locally:
            # Rebuild the path of the dumped file: the part of OUTPUT_DIR
            # after "cvpods_playground" plus the checkpoint's base name.
            ckpt_name = cfg.MODEL.WEIGHTS.split("/")[-1]
            model_prefix = cfg.OUTPUT_DIR.split("cvpods_playground")[1][1:]
            remote_file_path = os.path.join(cfg.OSS.DUMP_PREFIX, model_prefix,
                                            ckpt_name)
            logger.warning(
                f"The specified ckpt file ({cfg.MODEL.WEIGHTS}) was not found locally,"
                f" try to load the corresponding dump file on OSS ({remote_file_path})."
            )
            cfg.MODEL.WEIGHTS = remote_file_path
        valid_files = [cfg.MODEL.WEIGHTS]
    else:
        found = glob.glob(os.path.join(cfg.OUTPUT_DIR, "*.pth"))
        assert found, "No checkpoint file found in {}.".format(
            cfg.OUTPUT_DIR)

        by_ctime = sorted(found, key=os.path.getctime)
        newest = by_ctime[-1]
        if not args.end_iter:
            valid_files = [newest]
        else:
            valid_files = []
            for path in by_ctime:
                # Only names that sort (as strings) at or before the newest
                # file are considered.
                if path > newest:
                    continue
                try:
                    model_iter = int(re.split(r'(model_|\.pth)', path)[-3])
                except Exception:
                    # No iteration number in the name: skip this file.
                    logger.warning("remove {}".format(path))
                else:
                    if args.start_iter <= model_iter <= args.end_iter:
                        valid_files.append(path)
            assert valid_files, "No .pth files satisfy your requirement"

    for weights_path in valid_files:
        cfg.MODEL.WEIGHTS = weights_path
        model = build_model(cfg)

        checkpointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)

        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        # NOTE(review): TTA results are merged after verification and `res`
        # is not used afterwards in this function - confirm this is intended.
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
コード例 #7
0
def stage_main(args, cfg, build):
    """
    Train a model with ``Trainer``, or only evaluate it.

    In evaluation mode (``args.eval_only``) the weights in
    ``cfg.MODEL.WEIGHTS`` are loaded and tested, the results are verified on
    the main process, and TTA results are merged in when
    ``cfg.TEST.AUG.ENABLED`` is set. Otherwise the model is trained; before
    training a warning is logged when the space available under
    ``cfg.OUTPUT_DIR`` is smaller than the estimated size of the checkpoints
    this run will dump.

    Args:
        args: parsed arguments; ``opts``, ``resume`` and ``eval_only`` are
            read here and the whole object is passed to ``default_setup``.
        cfg: config object; it is updated from ``args.opts`` and replaced by
            the one returned from ``default_setup``.
        build: model build function handed to ``Trainer``.

    Returns:
        The evaluation results when ``args.eval_only`` is set, else ``None``.
    """
    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)

    # If you'd like to do anything fancier than the standard training logic,
    # consider writing your own training loop or subclassing the trainer.
    trainer = Trainer(cfg, build)
    trainer.resume_or_load(resume=args.resume)

    if args.eval_only:
        DefaultCheckpointer(trainer.model,
                            save_dir=cfg.OUTPUT_DIR,
                            resume=args.resume).resume_or_load(
                                cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, trainer.model)
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, trainer.model))
        return res

    # Check whether the workspace has enough storage space, assuming that a
    # single dumped model is 700 MB and that one model is dumped every
    # CHECKPOINT_PERIOD iterations.
    # f_bavail is used instead of f_bfree: f_bfree also counts blocks that
    # are reserved for the superuser, so it over-reports what an ordinary
    # training process can actually write.
    file_sys = os.statvfs(cfg.OUTPUT_DIR)
    free_space_Gb = (file_sys.f_bavail * file_sys.f_frsize) / 2**30
    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER //
                     cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
    if eval_space_Gb > free_space_Gb:
        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")

    if cfg.TEST.AUG.ENABLED:
        # The lambda defers the TTA evaluation until the hook triggers it.
        trainer.register_hooks([
            hooks.EvalHook(0,
                           lambda: trainer.test_with_TTA(cfg, trainer.model))
        ])

    trainer.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # Convert the last checkpoint to the pretrain format.
        convert_to_pretrained_model(input=os.path.join(cfg.OUTPUT_DIR,
                                                       "model_final.pth"),
                                    save_path=os.path.join(
                                        cfg.OUTPUT_DIR,
                                        "model_final_pretrain_weight.pkl"))