def __init__(self, config): """ GANTrainer expects a ModelDesc in config which sets the following attribute after :meth:`_build_graph`: g_loss, d_loss, g_vars, d_vars. """ input = QueueInput(config.dataflow) model = config.model cbs = input.setup(model.get_inputs_desc()) config.callbacks.extend(cbs) with TowerContext('', is_training=True): model.build_graph(input) opt = model.get_optimizer() # by default, run one d_min after one g_min with tf.name_scope('optimize'): g_min = opt.minimize(model.g_loss, var_list=model.g_vars, name='g_op') with tf.control_dependencies([g_min]): d_min = opt.minimize(model.d_loss, var_list=model.d_vars, name='d_op') self.train_op = d_min super(GANTrainer, self).__init__(config)
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. "
                    "This probably will lead to worse accuracy than reported.")

    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [(0, min(START_LR, BASE_LR)),
                                  (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
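
# Worked example for the schedule above, assuming args.batch = 512:
#   BASE_LR = 0.1 * (512 / 256.0) = 0.2  (> START_LR = 0.1)
# so the linear-warmup setter ramps the LR from 0.1 at epoch 0 to 0.2 at epoch 5,
# after which the step schedule decays it 10x at epochs 30, 60, 90 and 100.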
def __init__(self, config, d_period=1, g_period=1):
    """
    Args:
        d_period(int): period of each d_opt run
        g_period(int): period of each g_opt run
    """
    self._d_period = int(d_period)
    self._g_period = int(g_period)
    assert min(d_period, g_period) == 1

    input = QueueInput(config.dataflow)
    model = config.model
    cbs = input.setup(model.get_inputs_desc())
    config.callbacks.extend(cbs)

    with TowerContext('', is_training=True):
        model.build_graph(input)
    opt = model.get_optimizer()
    with tf.name_scope('optimize'):
        self.d_min = opt.minimize(model.d_loss, var_list=model.d_vars, name='d_min')
        self.g_min = opt.minimize(model.g_loss, var_list=model.g_vars, name='g_min')

    super(SeparateGANTrainer, self).__init__(config)
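
# Period semantics, illustrated (assumption based on tensorpack's SeparateGANTrainer,
# whose step loop runs each op when the global step is divisible by its period): with
# d_period=1, g_period=6, d_min runs on every step while g_min runs only on every 6th
# step, i.e. roughly six discriminator updates per generator update. The assert above
# guarantees that at least one of the two ops runs on every step.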
def fit(self, data):
    """Fit the model to the given data.

    Args:
        data(pandas.DataFrame): dataset to fit the model.

    Returns:
        None
    """
    self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata

    dataflow = TGANDataFlow(data, self.metadata)
    batch_data = BatchData(dataflow, self.batch_size)
    input_queue = QueueInput(batch_data)

    self.model = self.get_model(training=True)

    from tensorpack.callbacks import CometMLMonitor

    trainer = SeparateGANTrainer(
        model=self.model,
        input_queue=input_queue,
        g_period=6,
    )

    self.restore_path = os.path.join(self.model_dir, 'checkpoint')
    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        with open(os.path.join(self.log_dir, 'stats.json')) as f:
            starting_epoch = json.load(f)[-1]['epoch_num'] + 1
    else:
        session_init = None
        starting_epoch = 1

    action = 'k' if self.restore_session else 'd'
    # logger.set_logger_dir(self.log_dir, action=action)

    callbacks = []
    monitors = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
        callbacks.append(MergeAllSummaries(period=10))
    if self.experiment is not None:
        monitors.append(CometMLMonitor(experiment=self.experiment))

    trainer.train_with_defaults(
        callbacks=callbacks,
        monitors=monitors,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch,
    )

    self.prepare_sampling()
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker(),
            ScheduledHyperParamSetter(
                'learning_rate', [(0, BASE_LR), (30, BASE_LR * 1e-1),
                                  (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (5 * steps_per_epoch, BASE_LR)],
                    interp='linear', step_based=True))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )
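
# Note on the step-based warmup above (illustrative arithmetic): with args.batch = 512,
# steps_per_epoch = 1281167 // 512 = 2502, so the linear ramp from 0.1 to BASE_LR ends
# at global step 5 * 2502 = 12510, i.e. exactly 5 epochs in, expressed in steps rather
# than epochs.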
def __init__(self, config):
    self._nr_gpu = config.nr_tower
    assert self._nr_gpu > 1
    self._raw_devices = ['/gpu:{}'.format(k) for k in config.tower]
    self._input_source = StagingInputWrapper(QueueInput(config.dataflow), self._raw_devices)
    super(MultiGPUGANTrainer, self).__init__(config)
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate', [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
    )
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
def __init__(self, config, d_interval=1):
    """
    Args:
        d_interval: run d_opt once after this many runs of g_opt.
    """
    self._input_method = QueueInput(config.dataflow)
    self._d_interval = d_interval
    super(SeparateGANTrainer, self).__init__(config)
def train(args):
    data_folder = args.get("data_folder")
    save_folder = args.get("save_folder")
    image_size = args.get("image_size")
    max_epoch = args.get("max_epoch")
    save_epoch = args.get("save_epoch") or max_epoch // 10
    # Scale lr and steps_per_epoch accordingly.
    # Make sure the total number of gradient evaluations is consistent.
    n_gpu = args.get("n_gpu") or 1
    batch_size = args.get("batch_size") or BATCH
    equi_batch_size = max(n_gpu, 1) * batch_size
    lr = args.get("lr") or LR
    lr *= equi_batch_size
    steps_per_epoch = args.get("steps_per_epoch") or 1000
    steps_per_epoch = max(steps_per_epoch // equi_batch_size, 1)  # must stay a positive int
    image_steps = args.get("image_steps") or steps_per_epoch // 10
    scalar_steps = args.get("scalar_steps")
    if scalar_steps > 0:
        scalar_steps = max(scalar_steps // equi_batch_size, 1)
    else:
        scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act_input") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax, batch=batch_size)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, Style2PO(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at the end
            ScheduledHyperParamSetter('learning_rate',
                                      [(start_dec_epoch, lr), (max_epoch, 0)],
                                      interp="linear"),
            PeriodicTrigger(VisualizeTestSet(data_folder, image_size),
                            every_k_epochs=max(1, max_epoch // 100)),
            # MergeAllSummaries(period=scalar_steps),  # scalar only; slows down training, use TCMalloc
            MergeAllSummaries(period=image_steps, key="image_summaries"),
            MergeAllSummaries(key="acti_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)
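
# Illustrative numbers for the scaling logic above: with n_gpu=2 and batch_size=32,
# equi_batch_size = 2 * 32 = 64, so lr is multiplied by 64 and the default 1000
# steps_per_epoch becomes 1000 // 64 = 15. Each step now processes a 64x larger
# effective batch, so fewer steps per epoch keep the total gradient work consistent,
# as the comment at the top of the function requires.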
def get_config(model, fake=False):
    start_ = 0
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # fail loudly: lr_setting would otherwise be undefined below
            raise ValueError('no learning rate setting for start_ = {}'.format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0',
            #                'tower1/group3/block2/conv2/Abs_1:0',
            #                'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
def __init__(self, config, g_vs_d=1):
    self._input_method = QueueInput(config.dataset)
    super(GANTrainer, self).__init__(config)
    if g_vs_d > 1:
        self._opt_g = g_vs_d
        self._opt_d = 1
    else:
        self._opt_g = 1
        self._opt_d = int(1.0 / g_vs_d)
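
# Ratio examples for g_vs_d above: g_vs_d=3 gives 3 generator updates per
# discriminator update (_opt_g=3, _opt_d=1); g_vs_d=0.5 gives one generator update
# per 2 discriminator updates (_opt_g=1, _opt_d=int(1/0.5)=2); and the default
# g_vs_d=1 alternates 1:1.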
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [ModelSaver()]
        if data_aug:
            callbacks.append(ScheduledHyperParamSetter(
                'learning_rate', [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))

        infs = []
        for scale in scales:
            infs.append(ClassificationError('wrong-scale%03d-top1' % scale,
                                            'val-error-scale%03d-top1' % scale))
            infs.append(ClassificationError('wrong-scale%03d-top5' % scale,
                                            'val-error-scale%03d-top5' % scale))
        if distill:
            infs.append(ClassificationError('wrong-scale_ensemble-top1',
                                            'val-error-scale_ensemble-top1'))
            infs.append(ClassificationError('wrong-scale_ensemble-top5',
                                            'val-error-scale_ensemble-top5'))
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=120 if data_aug else 64,
        nr_tower=nr_tower)
def fit(self, data):
    """Fit the model to the given data.

    Args:
        data(pandas.DataFrame): dataset to fit the model.

    Returns:
        None
    """
    self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata

    dataflow = TGANDataFlow(data, self.metadata)
    batch_data = BatchData(dataflow, self.batch_size)
    input_queue = QueueInput(batch_data)

    self.model = self.get_model(training=True)

    if self.trainer == 'GANTrainer':
        trainer = GANTrainer(model=self.model, input_queue=input_queue)
    elif self.trainer == 'SeparateGANTrainer':
        trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)
    else:
        raise ValueError('Incorrect trainer name. Use GANTrainer or SeparateGANTrainer.')

    self.restore_path = os.path.join(self.model_dir, 'checkpoint')
    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        with open(os.path.join(self.log_dir, 'stats.json')) as f:
            starting_epoch = json.load(f)[-1]['epoch_num'] + 1
    else:
        session_init = None
        starting_epoch = 1

    action = 'k' if self.restore_session else None
    logger.set_logger_dir(self.log_dir, action=action)

    callbacks = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

    trainer.train_with_defaults(
        callbacks=callbacks,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch)

    self.prepare_sampling()
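
# Hypothetical caller-side usage of fit() (names hedged: `TGANModel` and its
# constructor arguments are inferred from the attributes this method reads, such as
# self.trainer, self.batch_size and self.continuous_columns; the real signature may
# differ):
#
#   import pandas as pd
#   df = pd.read_csv('census.csv')
#   model = TGANModel(continuous_columns=[0, 5, 16], trainer='SeparateGANTrainer')
#   model.fit(df)   # preprocesses, trains, then calls prepare_sampling()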
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4),
                                  (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
def __init__(self, config, d_period=1, g_period=1):
    """
    Args:
        d_period(int): period of each d_opt run
        g_period(int): period of each g_opt run
    """
    self._input_source = QueueInput(config.dataflow)
    self._d_period = int(d_period)
    self._g_period = int(g_period)
    assert min(d_period, g_period) == 1
    super(SeparateGANTrainer, self).__init__(config)
def get_config(model):
    batch = 1
    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    data = QueueInput(FakeData(
        [[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8'))
    return TrainConfig(
        model=model,
        data=data,
        callbacks=[],
        steps_per_epoch=1,
        max_epoch=1)
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5),
                                  (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Alternative schedules, kept for reference:
        # finetune COCO: [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO:       [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # finetune VOC:  [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        # infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #         ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
                ClassificationError('loss-wrong-top5', 'loss-val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1522,
        max_epoch=140,
        nr_tower=nr_tower)
def run(args):
    df_train = s2b_df(args.train_dir_face, args.train_dir_bitmoji,
                      args.batch_size, args.num_threads)
    df_test = s2b_df(args.test_dir_face, args.test_dir_bitmoji,
                     args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.decay
        else:
            return args.lr * args.decay**epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('LR', update_lr),
        # cb.HyperParamSetterWithFunc('Instance_Noise_Stddev', lambda epoch, stddev: stddev * args.decay),
        # cb.HyperParamSetterWithFunc('D_Uncertainty_Threshold', lambda epoch, threshold: threshold * args.decay),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats(['L_c', 'L_const', 'L_gan_d', 'L_gan_g', 'L_tid', 'L_tv'])]

    if get_nr_gpu() > 0:
        callbacks.append(cb.GPUUtilizationTracker())
    callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))

    S2BTrainer(QueueInput(df_train), Selfie2BitmojiModel(args)).train_with_defaults(
        callbacks=callbacks,
        max_epoch=args.epochs,
        steps_per_epoch=df_train.size(),
        session_init=SaverRestore(args.load_path))
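
# Decay behaviour of update_lr above, with illustrative values args.lr = 2e-4 and
# args.decay = 0.95: on a fresh run the LR at epoch e is 2e-4 * 0.95**e (about
# 1.2e-4 by epoch 10); with args.resume_lr set, it instead multiplies the current
# LR by 0.95 once per epoch, continuing the same geometric schedule from wherever
# the restored value left off.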
def get_config(model, fake=False, xla=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(10, 1e-2), (20, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    config = tf.ConfigProto()
    jit_level = 0
    if xla:
        # Turns on XLA JIT compilation
        jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.optimizer_options.global_jit_level = jit_level

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        # pass the ConfigProto so the XLA setting takes effect; it was previously
        # built but never used
        session_config=config,
        steps_per_epoch=10,
        max_epoch=1,
        nr_tower=nr_tower)
def get_config(args, model, num_gpus, num_towers):
    """
    Create the TensorPack Trainer configuration.

    :param args: The cli arguments.
    :param model: The model object to train.
    :param num_gpus: The number of gpus on which to train.
    :param num_towers: The number of data parallel towers to create.
    :return: A TrainConfig object.
    """
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, args.batch_size))

    df_train = avatar_synth_df(args.train_dir, args.batch_size, args.num_threads)
    df_test = avatar_synth_df(args.test_dir, args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            return cur_lr * args.lr_decay
        else:
            return args.lr * args.lr_decay**epoch

    callbacks = [
        cb.ModelSaver(),
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('tower0/Avatar_Synth/LR:0', update_lr),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats('Avatar_Synth/Cost')]

    if num_gpus > 0:
        callbacks.append(cb.GPUUtilizationTracker())
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(cb.DataParallelInferenceRunner(
            df_test, infs, list(range(num_towers))))

    return TrainConfig(
        model=model,
        dataflow=df_train,
        callbacks=callbacks,
        max_epoch=args.epochs,
        nr_tower=num_towers)
def get_data(self, name, num_gpu):
    gpu_batch = self.batch_size // num_gpu

    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'

    augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)

    parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    if isTrain:
        ds = dataset.ILSVRC12(self.datadir, name, shuffle=True, dir_structure='train')
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, gpu_batch, remainder=False)
        # ds = QueueInput(ds)
    else:
        ds = dataset.ILSVRC12Files(self.datadir, name, shuffle=False, dir_structure='train')
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, gpu_batch, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
        if num_gpu == 1:
            ds = QueueInput(ds)
    return ds
def get_config(model, checkpoint_dir, target_shape, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, target_shape, target_shape, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter(
                'learning_rate', [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    # 7.5 it / sec testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  # 5000
        max_epoch=110,
        nr_tower=nr_tower)
def get_config(model):
    nr_tower = get_nr_gpu()

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, args.batch_size_per_gpu))
    dataset_train = get_data('train', args.batch_size_per_gpu)
    dataset_val = get_data('val', args.batch_size_per_gpu)

    BASE_LR = 1e-3 * (args.batch_size_per_gpu * nr_tower / 256.0)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate', [(0, BASE_LR), (60, BASE_LR * 1e-1), (90, BASE_LR * 1e-2)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    # if BASE_LR > 0.1:
    #     callbacks.append(
    #         ScheduledHyperParamSetter(
    #             'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1280000 // (args.batch_size_per_gpu * nr_tower),
        max_epoch=110,
    )
def get_config(model, data_dir, crop_method_TR, color_augmentation, crop_method_TS):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    # data pipelines for train and validation
    dataset_train = get_data('train', data_dir, batch, crop_method_TR,
                             color_augmentation=color_augmentation, CAM_dir_pkl=CAM_DIR_PKL)
    dataset_val = get_data('val', data_dir, batch, crop_method_TS)

    # TODO
    callbacks = [
        # callbacks.ModelSaver(max_to_keep=10, keep_checkpoint_every_n_hours=0.5,
        #                      checkpoint_dir=None, var_collections='variables')
        ModelSaver(max_to_keep=MAX_EPOCH),
        # @20171129: finetuning ResNet-18 from ImageNet; a moderate learning rate
        # is probably preferable
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 1e-3), (20, 5e-4), (40, 1e-4), (60, 1e-5)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    # labels are 0 or 1
    infs = [ClassificationError('wrong-top1', 'val-error-top1')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        # steps_per_epoch=5000,
        max_epoch=MAX_EPOCH,
        nr_tower=nr_tower,
    )
def get_config(model, conf):
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch

    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(45, 1e-2), (60, 1e-3), (65, 1e-4), (70, 1e-5), (75, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=80,
        nr_tower=nr_tower)
def __init__(self, config):
    nr_gpu = config.nr_tower
    assert nr_gpu > 1
    raw_devices = ['/gpu:{}'.format(k) for k in config.tower]

    # setup input
    input = StagingInputWrapper(QueueInput(config.dataflow), raw_devices)
    model = config.model
    cbs = input.setup(model.get_inputs_desc())
    config.callbacks.extend(cbs)

    def get_cost():
        model.build_graph(input)
        return [model.d_loss, model.g_loss]

    devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
    cost_list = MultiGPUTrainerBase.build_on_multi_tower(config.tower, get_cost, devices)

    # simply average the cost; it might be faster to average the gradients instead
    with tf.name_scope('optimize'):
        d_loss = tf.add_n([x[0] for x in cost_list]) * (1.0 / nr_gpu)
        g_loss = tf.add_n([x[1] for x in cost_list]) * (1.0 / nr_gpu)

        opt = model.get_optimizer()
        # run one d_min after one g_min
        g_min = opt.minimize(g_loss, var_list=model.g_vars,
                             colocate_gradients_with_ops=True, name='g_op')
        with tf.control_dependencies([g_min]):
            d_min = opt.minimize(d_loss, var_list=model.d_vars,
                                 colocate_gradients_with_ops=True, name='d_op')
    self.train_op = d_min

    super(MultiGPUGANTrainer, self).__init__(config)
def get_config(model, nr_tower):
    batch = TOTAL_BATCH_SIZE // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    step_size = 1280000 // TOTAL_BATCH_SIZE
    max_iter = 3 * 10**5
    max_epoch = (max_iter // step_size) + 1
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            'learning_rate', [(0, 0.5), (max_iter, 0)],
            interp='linear', step_based=True),
    ]
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=step_size,
        max_epoch=max_epoch,
    )
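
# Epoch arithmetic for the config above, with TOTAL_BATCH_SIZE = 1024 as an example:
# step_size = 1280000 // 1024 = 1250 steps per epoch and max_epoch =
# (300000 // 1250) + 1 = 241 epochs, while the LR decays linearly from 0.5 to 0 over
# exactly max_iter = 300000 global steps regardless of how they split into epochs.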
    M.add(KL.Conv2D(32, 3, padding='same', activation='relu'))
    M.add(KL.Flatten())
    M.add(KL.Dense(512, activation='relu',
                   kernel_regularizer=keras.regularizers.l2(1e-5)))
    M.add(KL.Dropout(0.5))
    M.add(KL.Dense(10, activation=None,
                   kernel_regularizer=keras.regularizers.l2(1e-5)))
    M.add(KL.Activation('softmax'))
    return M


dataset_train, dataset_test = get_data()

M = KerasModel(
    model_func,
    inputs_desc=[InputDesc(tf.float32, [None, IMAGE_SIZE, IMAGE_SIZE, 1], 'images')],
    targets_desc=[InputDesc(tf.float32, [None, 10], 'labels')],
    input=QueueInput(dataset_train))
M.compile(
    optimizer=tf.train.AdamOptimizer(1e-3),
    loss='categorical_crossentropy',
    metrics='categorical_accuracy')
M.fit(
    validation_data=dataset_test,
    steps_per_epoch=dataset_train.size(),
    callbacks=[ModelSaver()])
    scalar_steps = 0  # merge scalar summary every epoch
    # lr starts decreasing at half of max epoch
    start_dec_epoch = max_epoch // 2
    # stops when lr is 0.01 of its initial value
    end_epoch = max_epoch - int((max_epoch - start_dec_epoch) * 0.01)
    # adjust noise input range according to the input act
    zmin, zmax = (0, 1) if args.get("act") == "identity" else (-1, 1)

    if save_folder is None:
        logger.auto_set_dir()
    else:
        logger.set_logger_dir(save_folder)

    df = get_data(data_folder, image_size, zmin=zmin, zmax=zmax)
    df = PrintData(df)
    data = QueueInput(df)

    SynTexTrainer(data, AdaptiveSynTex(args), n_gpu).train_with_defaults(
        callbacks=[
            PeriodicTrigger(ModelSaver(), every_k_epochs=save_epoch),
            PeriodicTrigger(ModelSaver(), every_k_epochs=end_epoch),  # save model at the end
            ScheduledHyperParamSetter('learning_rate',
                                      [(start_dec_epoch, lr), (max_epoch, 0)],
                                      interp="linear"),
            # PeriodicTrigger(VisualizeTestSet(data_folder, image_size), every_k_epochs=10),
            MergeAllSummaries(period=scalar_steps),  # scalar only
            MergeAllSummaries(period=image_steps, key="image_summaries"),
        ],
        max_epoch=end_epoch,
        steps_per_epoch=steps_per_epoch,
        session_init=None)