def create_callbacks(warmup_schedule, lr_schedule, model, logdir):
    # Create callbacks ...
    callbacks_ = [
        PeriodicCallback(
            ModelSaver(max_to_keep=20, keep_checkpoint_every_n_hours=1),
            every_k_epochs=cfg.TRAIN.CHECKPOINT_PERIOD),
        # linear warmup
        ScheduledHyperParamSetter(
            'learning_rate', warmup_schedule, interp='linear', step_based=True),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        GPUMemoryTracker(),
        HostMemoryTracker(),
        ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
        EstimatedTimeLeft(median=True),
        SessionRunTimeout(60000),  # 1 minute timeout (value is in ms)
        GPUUtilizationTracker(),
    ]
    if cfg.TRAIN.EVAL_PERIOD > 0:
        callbacks_.extend([
            EvalCallback(dataset, *model.get_inference_tensor_names(), logdir)
            for dataset in cfg.DATA.VAL  # + cfg.DATA.TRAIN
        ])
    return callbacks_
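# Hedged usage sketch (not from the original file): how the two schedules fed
# to create_callbacks() are typically shaped. ScheduledHyperParamSetter expects
# a list of (point, value) pairs; with step_based=True the points are training
# steps (as linear warmup needs), otherwise they are epochs. The constants
# below (base_lr, 1000 warmup steps, decay at epochs 120/160) are illustrative
# assumptions, and `model`/`logdir` are assumed to come from the surrounding
# script.
base_lr = 0.01
warmup_schedule = [(0, base_lr / 10), (1000, base_lr)]       # per-step linear ramp
lr_schedule = [(120, base_lr * 0.1), (160, base_lr * 0.01)]  # per-epoch step decay
callbacks = create_callbacks(warmup_schedule, lr_schedule, model, logdir)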
total_passes = cfg.TRAIN.LR_SCHEDULE[-1] * 8 / train_dataflow.size()
logger.info("Total passes of the training set is: {:.5g}".format(total_passes))

callbacks = [
    PeriodicCallback(
        ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
        every_k_epochs=20),
    # linear warmup
    ScheduledHyperParamSetter(
        'learning_rate', warmup_schedule, interp='linear', step_based=True),
    ScheduledHyperParamSetter('learning_rate', lr_schedule),
    PeakMemoryTracker(),
    EstimatedTimeLeft(median=True),
    SessionRunTimeout(60000).set_chief_only(True),  # 1 minute timeout
] + [
    EvalCallback(dataset, *MODEL.get_inference_tensor_names(), args.logdir)
    for dataset in cfg.DATA.VAL
]
if not is_horovod:
    callbacks.append(GPUUtilizationTracker())

if is_horovod and hvd.rank() > 0:
    session_init = None
else:
    if args.load:
        session_init = get_model_loader(args.load)
    else:
        session_init = get_model_loader(
            cfg.BACKBONE.WEIGHTS) if cfg.BACKBONE.WEIGHTS else None

traincfg = TrainConfig(
    model=MODEL,
    # remaining fields completed as in tensorpack's FasterRCNN train.py
    data=QueueInput(train_dataflow),
    callbacks=callbacks,
    steps_per_epoch=cfg.TRAIN.STEPS_PER_EPOCH,
    max_epoch=cfg.TRAIN.LR_SCHEDULE[-1] * 8 // cfg.TRAIN.STEPS_PER_EPOCH,
    session_init=session_init,
)
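# Hedged sketch (not part of the excerpt above): how such a TrainConfig is
# normally launched in tensorpack. The trainer choice mirrors the is_horovod
# flag used above; SyncMultiGPUTrainerReplicated takes the GPU count.
from tensorpack import (HorovodTrainer, SyncMultiGPUTrainerReplicated,
                        launch_train_with_config)

if is_horovod:
    trainer = HorovodTrainer(average=False)
else:
    trainer = SyncMultiGPUTrainerReplicated(
        cfg.TRAIN.NUM_GPUS, average=False, mode='nccl')
launch_train_with_config(traincfg, trainer)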
callbacks = [
    PeriodicCallback(
        ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
        every_k_epochs=20),
    # linear warmup
    ScheduledHyperParamSetter(
        'learning_rate', warmup_schedule, interp='linear', step_based=True),
    ScheduledHyperParamSetter('learning_rate', lr_schedule),
    GPUMemoryTracker(),
    HostMemoryTracker(),
    ThroughputTracker(samples_per_step=cfg.TRAIN.NUM_GPUS),
    EstimatedTimeLeft(median=True),
    SessionRunTimeout(60000),  # 1 minute timeout
]
if cfg.TRAIN.EVAL_PERIOD > 0:
    callbacks.extend([
        EvalCallback(dataset, *MODEL.get_inference_tensor_names(), args.logdir)
        for dataset in cfg.DATA.VAL
    ])
if not is_horovod:
    callbacks.append(GPUUtilizationTracker())

if is_horovod and hvd.rank() > 0:
    session_init = None
else:
    if args.load:
        session_init = get_model_loader(args.load)
    else:
        session_init = get_model_loader(
            cfg.BACKBONE.WEIGHTS) if cfg.BACKBONE.WEIGHTS else None

traincfg = TrainConfig(
    model=MODEL,
    data=QueueInput(train_dataflow),
    callbacks=callbacks,
    session_init=session_init,
)
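# Hedged note on the tracker difference between this variant and the one
# above: newer tensorpack releases renamed PeakMemoryTracker to
# GPUMemoryTracker and added HostMemoryTracker/ThroughputTracker. The shim
# below keeps a script working across versions, assuming only that one of
# the two names exists in the installed tensorpack.
try:
    from tensorpack.callbacks import GPUMemoryTracker
except ImportError:  # older tensorpack releases
    from tensorpack.callbacks import PeakMemoryTracker as GPUMemoryTracker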
callbacks = [
    PeriodicCallback(
        ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=1),
        every_k_epochs=20),
    # linear warmup
    ScheduledHyperParamSetter(
        'learning_rate', warmup_schedule, interp='linear', step_based=True),
    ScheduledHyperParamSetter('learning_rate', lr_schedule),
    PeakMemoryTracker(),
    EstimatedTimeLeft(median=True),
    SessionRunTimeout(60000).set_chief_only(True),  # 1 minute timeout
] + [
    EvalCallback(dataset, *MODEL.get_inference_tensor_names(), args.logdir,
                 1)  # or cfg.TRAIN.BATCH_SIZE_PER_GPU
    for dataset in cfg.DATA.VAL
]
if not is_horovod:
    callbacks.append(GPUUtilizationTracker())
callbacks.append(
    ThroughputTracker(cfg.TRAIN.BATCH_SIZE_PER_GPU * cfg.TRAIN.NUM_GPUS,
                      args.images_per_epoch,
                      trigger_every_n_steps=args.throughput_log_freq,
                      log_fn=logger.info))

if args.tfprof:
    # We only get the tf profiling chrome trace on rank == 0
    if hvd.rank() == 0:
        # (assumed completion: the full script appends a profiling callback
        # here, e.g. tensorpack's GraphProfiler, which dumps chrome traces)
        callbacks.append(GraphProfiler(dump_tracing=True, dump_event=True))
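# Hedged sketch of the command-line flags this variant relies on. The flag
# names are taken from the snippet above (args.logdir, args.load,
# args.images_per_epoch, args.throughput_log_freq, args.tfprof); the defaults
# and help strings are illustrative assumptions, not the script's actual
# values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--logdir', default='train_log/maskrcnn')
parser.add_argument('--load', help='checkpoint to initialize the model from')
parser.add_argument('--images_per_epoch', type=int, default=120000,
                    help='samples per epoch, used by ThroughputTracker')
parser.add_argument('--throughput_log_freq', type=int, default=100,
                    help='log throughput every N steps')
parser.add_argument('--tfprof', action='store_true',
                    help='dump a tf profiling chrome trace on rank 0')
args = parser.parse_args()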