Example no. 1
0
def create_callbacks(warmup_schedule, lr_schedule, model, logdir):
    """Assemble the standard set of training callbacks.

    Args:
        warmup_schedule: schedule (step/value pairs) for the linear warmup phase.
        lr_schedule: schedule applied to the learning rate after warmup.
        model: model object providing ``get_inference_tensor_names()``.
        logdir: directory where evaluation callbacks write their output.

    Returns:
        list: callbacks to hand to the trainer.
    """
    # Periodic checkpointing: keep the 20 newest checkpoints plus hourly ones.
    saver = ModelSaver(max_to_keep=20, keep_checkpoint_every_n_hours=1)
    cbs = [
        PeriodicCallback(saver, every_k_epochs=cfg.TRAIN.CHECKPOINT_PERIOD),
        # Step-based linear warmup of the learning rate.
        ScheduledHyperParamSetter(
            'learning_rate', warmup_schedule, interp='linear', step_based=True),
        # Epoch-based schedule that takes over once warmup finishes.
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        # Resource / progress monitors.
        GPUMemoryTracker(),
        HostMemoryTracker(),
        ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
        EstimatedTimeLeft(median=True),
        SessionRunTimeout(60000),  # 60000 ms = 1 minute timeout
        GPUUtilizationTracker(),
    ]
    # When periodic evaluation is enabled, add one evaluation callback per
    # validation dataset (cfg.DATA.TRAIN was once considered here as well).
    if cfg.TRAIN.EVAL_PERIOD > 0:
        for dataset in cfg.DATA.VAL:
            cbs.append(
                EvalCallback(dataset, *model.get_inference_tensor_names(), logdir))
    return cbs
Example no. 2
0
        # Number of passes over the training set for the whole run.
        # NOTE(review): LR_SCHEDULE[-1] looks like the final scheduled step and
        # the * 8 presumably rescales an 8-GPU step count into samples — confirm.
        total_passes = cfg.TRAIN.LR_SCHEDULE[-1] * 8 / train_dataflow.size()
        logger.info("Total passes of the training set is: {:.5g}".format(total_passes))

        callbacks = [
            # Checkpoint every 20 epochs; keep the 10 newest plus hourly snapshots.
            PeriodicCallback(
                ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
                every_k_epochs=20),
            # linear warmup of the learning rate, applied per training step
            ScheduledHyperParamSetter(
                'learning_rate', warmup_schedule, interp='linear', step_based=True),
            # epoch-based schedule that takes over after warmup
            ScheduledHyperParamSetter('learning_rate', lr_schedule),
            PeakMemoryTracker(),
            EstimatedTimeLeft(median=True),
            # Guard against hung sessions; enforced on the chief worker only.
            SessionRunTimeout(60000).set_chief_only(True),   # 1 minute timeout
        ] + [
            # One evaluation callback per validation dataset.
            EvalCallback(dataset, *MODEL.get_inference_tensor_names(), args.logdir)
            for dataset in cfg.DATA.VAL
        ]
        # NOTE(review): GPUUtilizationTracker is skipped under horovod —
        # presumably incompatible with one-process-per-GPU execution; confirm.
        if not is_horovod:
            callbacks.append(GPUUtilizationTracker())

        # Only the chief (rank 0) restores weights; other horovod ranks start
        # with no session_init (presumably receiving weights by broadcast —
        # confirm against the trainer setup).
        if is_horovod and hvd.rank() > 0:
            session_init = None
        else:
            if args.load:
                # An explicitly requested checkpoint wins over backbone weights.
                session_init = get_model_loader(args.load)
            else:
                session_init = get_model_loader(cfg.BACKBONE.WEIGHTS) if cfg.BACKBONE.WEIGHTS else None

        traincfg = TrainConfig(
            model=MODEL,
Example no. 3
0
                         every_k_epochs=20),
        # linear warmup of the learning rate, applied per training step
        ScheduledHyperParamSetter('learning_rate',
                                  warmup_schedule,
                                  interp='linear',
                                  step_based=True),
        # epoch-based schedule that takes over after warmup
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        # Resource / progress monitors: GPU memory, host RAM, samples/sec.
        GPUMemoryTracker(),
        HostMemoryTracker(),
        ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
        EstimatedTimeLeft(median=True),
        SessionRunTimeout(60000),  # 1 minute timeout
    ]
    # When periodic evaluation is enabled, evaluate each validation dataset.
    if cfg.TRAIN.EVAL_PERIOD > 0:
        callbacks.extend([
            EvalCallback(dataset, *MODEL.get_inference_tensor_names(),
                         args.logdir) for dataset in cfg.DATA.VAL
        ])
    # NOTE(review): GPUUtilizationTracker is skipped under horovod —
    # presumably incompatible with one-process-per-GPU execution; confirm.
    if not is_horovod:
        callbacks.append(GPUUtilizationTracker())

    # Only the chief (rank 0) restores weights; other horovod ranks start
    # with no session_init (presumably receiving weights by broadcast).
    if is_horovod and hvd.rank() > 0:
        session_init = None
    else:
        if args.load:
            # An explicitly requested checkpoint wins over backbone weights.
            session_init = get_model_loader(args.load)
        else:
            session_init = get_model_loader(
                cfg.BACKBONE.WEIGHTS) if cfg.BACKBONE.WEIGHTS else None

    traincfg = TrainConfig(model=MODEL,
                           data=QueueInput(train_dataflow),
Example no. 4
0
        callbacks = [
            # Checkpoint every 20 epochs; keep the 10 newest plus hourly snapshots.
            PeriodicCallback(ModelSaver(max_to_keep=10,
                                        keep_checkpoint_every_n_hours=1),
                             every_k_epochs=20),
            # linear warmup of the learning rate, applied per training step
            ScheduledHyperParamSetter('learning_rate',
                                      warmup_schedule,
                                      interp='linear',
                                      step_based=True),
            # epoch-based schedule that takes over after warmup
            ScheduledHyperParamSetter('learning_rate', lr_schedule),
            PeakMemoryTracker(),
            EstimatedTimeLeft(median=True),
            # Guard against hung sessions; enforced on the chief worker only.
            SessionRunTimeout(60000).set_chief_only(True),  # 1 minute timeout
        ] + [
            # One evaluation callback per validation dataset. The trailing 1 is
            # presumably an eval batch size (see the commented-out
            # BATCH_SIZE_PER_GPU alternative) — confirm against EvalCallback.
            EvalCallback(dataset, *MODEL.get_inference_tensor_names(),
                         args.logdir, 1)  #cfg.TRAIN.BATCH_SIZE_PER_GPU)
            for dataset in cfg.DATA.VAL
        ]
        # NOTE(review): GPUUtilizationTracker is skipped under horovod —
        # presumably incompatible with one-process-per-GPU execution; confirm.
        if not is_horovod:
            callbacks.append(GPUUtilizationTracker())

        # Track throughput from the global batch size (per-GPU batch * GPUs).
        callbacks.append(
            ThroughputTracker(cfg.TRAIN.BATCH_SIZE_PER_GPU *
                              cfg.TRAIN.NUM_GPUS,
                              args.images_per_epoch,
                              trigger_every_n_steps=args.throughput_log_freq,
                              log_fn=logger.info))

        if args.tfprof:
            # We only get tf profiling chrome trace on rank==0
            if hvd.rank() == 0: