def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),
    )

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )

    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

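# Usage sketch (an assumption, not part of the original snippet): the TrainConfig assembled
# by train() above still has to be handed to a tensorpack trainer before anything runs.
from tensorpack import SimpleTrainer, SyncMultiGPUTrainerReplicated, launch_train_with_config

def launch(train_conf, num_gpus=1):
    # One GPU: plain SimpleTrainer; several GPUs: replicated data-parallel training.
    trainer = SimpleTrainer() if num_gpus <= 1 else SyncMultiGPUTrainerReplicated(num_gpus)
    launch_train_with_config(train_conf, trainer)
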
def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
    valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt['model_flags']
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
    ]

    for param_name, param_info in opt['manual_parameters'].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus))))

    if self.model_mode == 'seg_gland':
        callbacks.append(MaxSaver('valid_dice_obj'))
    elif self.model_mode == 'seg_nuc':
        callbacks.append(MaxSaver('valid_dice_np'))
    else:
        callbacks.append(MaxSaver('valid_auc'))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt['nr_epochs'],
    )
    config.session_init = sess_init

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return

def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = PER_GPU_BATCH_SIZE

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    callbacks = [
        StepTimeCallback(),
        GPUUtilizationTracker(),
    ]
    if args.fake:
        dataset_train = FakeData([[batch, 224, 224, 3], [batch]], 1000,
                                 random=False, dtype=['uint8', 'int32'])

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)

    num_gpus = get_nr_gpu()
    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281 // (PER_GPU_BATCH_SIZE * get_nr_gpu()),
        max_epoch=3,
    )

def get_config(X_train, X_valid, y_train, y_valid, model_path=None):
    data_train, data_valid = get_data(X_train, X_valid, y_train, y_valid)
    steps_per_epoch = data_train.size()
    cur_loss = 'tot_loss'
    triggerk = 15
    visualk = 5

    config = TrainConfig(
        model=Model(),
        dataflow=data_train,
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=triggerk),
            PeriodicTrigger(MinSaver('validation_' + cur_loss), every_k_epochs=triggerk),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 2e-4), (150, 1e-4), (300, 5e-5),
                                       (600, 1e-5), (800, 1e-6)],
                                      interp='linear'),
            PeriodicTrigger(VisualizeRunner(), every_k_epochs=visualk),
            PeriodicTrigger(InferenceRunner(data_valid, [ScalarStats(cur_loss)]),
                            every_k_epochs=5)
        ],
        session_init=SaverRestore(model_path) if model_path is not None else None,
        # session_config=session_config,
        steps_per_epoch=steps_per_epoch,
        max_epoch=2000)
    return config

def get_config(args):
    if args.gpu is not None:
        NR_GPU = len(args.gpu.split(','))
        batch_size = int(args.batch_size) // NR_GPU
    else:
        batch_size = int(args.batch_size)

    ds_train = get_data('train', batch_size)
    ds_test = get_data('test', batch_size)

    callbacks = [
        ModelSaver(),
        # ScheduledHyperParamSetter('learning_rate',
        #     [(0, 1e-4), (3, 2e-4), (6, 3e-4), (10, 6e-4), (15, 1e-3), (60, 1e-4), (90, 1e-5)]),
        HumanHyperParamSetter('learning_rate')
    ]

    config = edict()
    config.stage = 1

    return TrainConfig(
        dataflow=ds_train,
        callbacks=callbacks,
        model=Model(stage=1),
        max_epoch=cfg.me_max_iteration * batch_size - ds_train.size()
    )

def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )

def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. "
                    "This probably will lead to worse accuracy than reported.")
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1),
                    (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3),
                    (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )

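# Worked example (illustration only, helper name and numbers are hypothetical): the snippet
# above applies the linear LR scaling rule, lr = 0.1 * total_batch / 256, and ramps up to it
# over 5 epochs whenever the scaled LR exceeds the 0.1 starting point.
def scaled_lr_schedule(total_batch, start_lr=0.1):
    base_lr = start_lr * (total_batch / 256.0)
    schedule = [(0, min(start_lr, base_lr)), (30, base_lr * 1e-1),
                (60, base_lr * 1e-2), (90, base_lr * 1e-3), (100, base_lr * 1e-4)]
    warmup = [(0, start_lr), (5, base_lr)] if base_lr > start_lr else None
    return schedule, warmup

# e.g. total_batch=1024 gives base_lr=0.4 with a linear warmup from 0.1 to 0.4 over 5 epochs.
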
def get_config(fake=False, data_format='NCHW'):
    nr_tower = max(get_nr_gpu(), 1)
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')

    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')
            ]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower)

def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker(),
            ScheduledHyperParamSetter(
                'learning_rate', [(0, BASE_LR), (30, BASE_LR * 1e-1),
                                  (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (5 * steps_per_epoch, BASE_LR)],
                    interp='linear', step_based=True))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )

def get_config():
    # prepare dataset
    step_per_epoch = 1024
    dataset_train = get_data('train')
    dataset_test = get_data('test', 128)

    sess_config = get_default_sess_config(0.5)
    nr_gpu = get_nr_gpu()

    lr = tf.train.exponential_decay(
        learning_rate=1e-5,
        global_step=get_global_step_var(),
        decay_steps=step_per_epoch * 10,
        decay_rate=0.8,
        staircase=True,
        name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-5),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(dataset_test, ScalarStats('combined_cost')),
        ]),
        session_config=sess_config,
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=1000,
    )

def get_config():
    # prepare dataset
    dataset_train = get_data('train')
    step_per_epoch = dataset_train.size()
    dataset_test = get_data('test')

    sess_config = get_default_sess_config(0.5)
    nr_gpu = get_nr_gpu()

    lr = tf.train.exponential_decay(
        learning_rate=1e-2,
        global_step=get_global_step_var(),
        decay_steps=step_per_epoch * (30 if nr_gpu == 1 else 20),
        decay_rate=0.5,
        staircase=True,
        name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(dataset_test, ClassificationError())
        ]),
        session_config=sess_config,
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=300,
    )

def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
        nr_tower=nr_tower)

def get_config(args):
    if args.gpu is not None:
        NR_GPU = len(args.gpu.split(','))
        batch_size = int(args.batch_size) // NR_GPU
    else:
        batch_size = int(args.batch_size)

    ds_train = get_data('train', batch_size)
    ds_test = get_data('test', batch_size)

    callbacks = [
        ModelSaver(),
        HumanHyperParamSetter('learning_rate')
    ]

    if args.stage == 1:
        max_epoch = cfg.me_max_iteration * batch_size - ds_train.size()
    elif args.stage == 2:
        max_epoch = cfg.spmc_max_iteration * batch_size - ds_train.size()
    else:
        max_epoch = 99999

    return TrainConfig(
        dataflow=ds_train,
        callbacks=callbacks,
        model=Model(stage=args.stage),
        max_epoch=max_epoch
    )

def get_config():
    # prepare dataset
    dataset_train = get_data('train')
    step_per_epoch = dataset_train.size()
    dataset_test = get_data('test')

    sess_config = get_default_sess_config(0.9)

    lr = tf.Variable(0.01, trainable=False, name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataset_train,
        optimizer=tf.train.MomentumOptimizer(lr, 0.9),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            InferenceRunner(dataset_test,
                            [ScalarStats('cost'), ClassificationError()]),
            ScheduledHyperParamSetter('learning_rate',
                                      [(1, 0.1), (82, 0.01), (123, 0.001), (300, 0.0002)])
        ]),
        session_config=sess_config,
        model=Model(n=5),
        step_per_epoch=step_per_epoch,
        max_epoch=500,
    )

def get_config(model):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 0.1), (3, BASE_LR)], interp='linear'),
        ScheduledHyperParamSetter('learning_rate',
                                  [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                   (80, BASE_LR * 1e-3)]),
        PeriodicTrigger(
            DataParallelInferenceRunner(dataset_val, infs, list(range(nr_tower))),
            every_k_epochs=1),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)

    return TrainConfig(
        model=model,
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // TOTAL_BATCH_SIZE,
        max_epoch=100,
    )

def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
    valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt["model_flags"]
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        # ModelSaver(max_to_keep=20),  # TODO dynamic this
        ModelSaver(max_to_keep=opt["nr_epochs"]),
        # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
    ]

    for param_name, param_info in opt["manual_parameters"].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
    )
    callbacks.append(MaxSaver("valid_dice"))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt["nr_epochs"],
    )
    config.session_init = sess_init

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    # TODO: save
    return

def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            print('learning rate setting not found!')

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0',
            #                'tower1/group3/block2/conv2/Abs_1:0',
            #                'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )

def pred_config(self, args, df, callbacks) -> TrainConfig:
    return TrainConfig(
        model=self.train_model(args),
        data=StagingInput(QueueInput(df)),
        callbacks=callbacks,
        max_epoch=args.epochs,
        steps_per_epoch=args.steps,
        session_init=SaverRestore(args.load) if args.load else None,
    )

def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch)]
    if args.fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
    else:
        data = QueueInput(
            get_imagenet_dataflow(args.data, 'train', batch),
            # use a larger queue
            queue=tf.FIFOQueue(200, [tf.uint8, tf.int32], [[batch, 224, 224, 3], [batch]])
        )

    BASE_LR = 30
    SCALED_LR = BASE_LR * (args.batch / 256.0)
    callbacks.extend([
        ModelSaver(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate', [
                (0, SCALED_LR),
                (60, SCALED_LR * 1e-1),
                (70, SCALED_LR * 1e-2),
                (80, SCALED_LR * 1e-3),
                (90, SCALED_LR * 1e-4),
            ]),
    ])

    dataset_val = get_imagenet_dataflow(args.data, 'val', 64)
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    if args.load.endswith(".npz"):
        # a released model in npz format
        init = SmartInit(args.load)
    else:
        # a pre-trained checkpoint
        init = SaverRestore(args.load, ignore=("learning_rate", "global_step"))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        session_init=init,
        max_epoch=100,
    )

def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(mode='train')
    valid_datagen = self.get_datagen(mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    callbacks = [
        ModelSaver(max_to_keep=200),
        ScheduledHyperParamSetter('learning_rate', self.lr_sched),
    ]

    ######
    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus))))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

    config = TrainConfig(
        model=MODEL_MAKER(freeze),
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=self.nr_epochs,
    )
    config.session_init = sess_init

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return

def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )

def get_config():
    # anchors = np.mgrid[0:4, 0:4][:, 1:, 1:].transpose(1, 2, 0).reshape((-1, 2)) / 4.0
    # prepare dataset
    d1 = dataset.SVHNDigit('train')
    d2 = dataset.SVHNDigit('extra')
    train = RandomMixData([d1, d2])
    test = dataset.SVHNDigit('test')

    augmentors = [
        imgaug.Resize((40, 40)),
        imgaug.BrightnessAdd(30),
        imgaug.Contrast((0.5, 1.5)),
        imgaug.GaussianDeform(  # this is slow
            [(0.2, 0.2), (0.2, 0.8), (0.8, 0.8), (0.8, 0.2)],
            (40, 40), 0.2, 3),
    ]
    train = AugmentImageComponent(train, augmentors)
    train = BatchData(train, 128)
    nr_proc = 5
    train = PrefetchData(train, 5, nr_proc)
    step_per_epoch = train.size()

    augmentors = [
        imgaug.Resize((40, 40)),
    ]
    test = AugmentImageComponent(test, augmentors)
    test = BatchData(test, 128, remainder=True)

    sess_config = get_default_sess_config(0.8)

    lr = tf.train.exponential_decay(
        learning_rate=1e-3,
        global_step=get_global_step_var(),
        decay_steps=train.size() * 60,
        decay_rate=0.2,
        staircase=True,
        name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=train,
        optimizer=tf.train.AdamOptimizer(lr),
        callbacks=Callbacks([
            StatPrinter(),
            ModelSaver(),
            # use the augmented test set built above (was an undefined `dataset_test`)
            InferenceRunner(test, [ScalarStats('cost'), ClassificationError()])
        ]),
        session_config=sess_config,
        model=Model(),
        step_per_epoch=step_per_epoch,
        max_epoch=350,
    )

def get_config(model):
    batch = 1
    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    data = QueueInput(
        FakeData([[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8'))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=[],
        steps_per_epoch=1,
        max_epoch=1)

def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5),
                                       (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO:
        #   [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO:
        #   [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC:
        #   [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        # infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #         ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1522,
        max_epoch=140,
        nr_tower=nr_tower)

def get_config(model):
    input_sig = model.get_input_signature()
    nr_tower = max(hvd.size(), 1)
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch), UpdateMomentumEncoder()]

    if args.fake:
        data = QueueInput(FakeData(
            [x.shape for x in input_sig], 1000, random=False, dtype='uint8'))
    else:
        zmq_addr = 'ipc://@imagenet-train-b{}'.format(batch)
        data = ZMQInput(zmq_addr, 25, bind=False)

        dataset = data.to_dataset(input_sig).repeat().prefetch(15)
        dataset = dataset.apply(tf.data.experimental.prefetch_to_device('/gpu:0'))
        data = TFDatasetInput(dataset)

    callbacks.extend([
        ModelSaver(),
        EstimatedTimeLeft(),
    ])

    if not args.v2:
        # step-wise LR in v1
        SCALED_LR = BASE_LR * (args.batch / 256.0)
        callbacks.append(
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(BASE_LR, SCALED_LR)),
                    (120, SCALED_LR * 1e-1),
                    (160, SCALED_LR * 1e-2)]))
        if SCALED_LR > BASE_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, BASE_LR), (5, SCALED_LR)], interp='linear'))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=200,
    )

def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(10, 1e-2), (20, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    config = tf.ConfigProto()
    jit_level = 0
    if xla:
        # Turns on XLA JIT compilation
        jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.optimizer_options.global_jit_level = jit_level

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=10,
        max_epoch=1,
        session_config=config,  # pass the JIT option to the session (assumed intent of the `xla` flag)
        nr_tower=nr_tower)

def get_config(args, model, num_gpus, num_towers):
    """
    Create the TensorPack Trainer configuration.

    :param args: The cli arguments.
    :param model: The model object to train.
    :param num_gpus: The number of gpus on which to train.
    :param num_towers: The number of data parallel towers to create.
    :return: A TrainConfig object.
    """
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, args.batch_size))

    df_train = avatar_synth_df(args.train_dir, args.batch_size, args.num_threads)
    df_test = avatar_synth_df(args.test_dir, args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.lr_decay
        else:
            return args.lr * args.lr_decay ** epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('tower0/Avatar_Synth/LR:0', update_lr),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats('Avatar_Synth/Cost')]

    if num_gpus > 0:
        callbacks.append(cb.GPUUtilizationTracker())

    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            cb.DataParallelInferenceRunner(df_test, infs, list(range(num_towers))))

    return TrainConfig(
        model=model,
        dataflow=df_train,
        callbacks=callbacks,
        max_epoch=args.epochs,
        nr_tower=num_towers)

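# Illustration of the update_lr schedule above (hypothetical numbers, not from the original
# project): with args.lr = 1e-3 and args.lr_decay = 0.9, a fresh run follows
# lr(epoch) = 1e-3 * 0.9 ** epoch, while a resumed run keeps multiplying the current value.
def example_lr(epoch, lr=1e-3, lr_decay=0.9):
    return lr * lr_decay ** epoch

assert abs(example_lr(0) - 1e-3) < 1e-12
assert abs(example_lr(10) - 1e-3 * 0.9 ** 10) < 1e-12  # ~3.49e-4 after 10 epochs
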
def get_config(model, checkpoint_dir, target_shape, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, target_shape, target_shape, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    # 7.5 it / sec testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  # 5000
        max_epoch=110,
        nr_tower=nr_tower)

def get_config(model):
    nr_tower = get_nr_gpu()
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, args.batch_size_per_gpu))

    dataset_train = get_data('train', args.batch_size_per_gpu)
    dataset_val = get_data('val', args.batch_size_per_gpu)

    BASE_LR = 1e-3 * (args.batch_size_per_gpu * nr_tower / 256.0)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, BASE_LR), (60, BASE_LR * 1e-1),
                                   (90, BASE_LR * 1e-2)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    '''
    if BASE_LR > 0.1:
        callbacks.append(
            ScheduledHyperParamSetter(
                'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))
    '''
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1280000 // (args.batch_size_per_gpu * nr_tower),
        max_epoch=110,
    )

def get_config(lmdb_path, txt_path, nr_gpu):
    df = ResNet.get_data(lmdb_path, txt_path)
    return TrainConfig(
        model=ResNet(),
        dataflow=df,
        callbacks=[ModelSaver()],
        extra_callbacks=[
            MovingAverageSummary(),
            ProgressBar(["EMA/cost", "EMA/loss/pos-dist", "EMA/loss/neg-dist"]),
            MergeAllSummaries(),
            RunUpdateOps()
        ],
        nr_tower=nr_gpu,
        steps_per_epoch=1800,
        max_epoch=30,
        session_config=tf.ConfigProto(allow_soft_placement=True))