def get_config(model, fake=False):
    """Build a tensorpack TrainConfig for ImageNet-style training.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on FakeData (benchmark mode, no callbacks).

    Returns:
        TrainConfig: 110 epochs at 5000 steps per epoch.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            # step-wise 10x LR decay at epochs 30/60/85/95/105
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            # allows changing the LR at runtime via the hyperparam file
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower
    )
def get_config(model, nr_tower):
    """Assemble the TrainConfig: dataflow, LR schedule and validation callbacks."""
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))

    train_df = get_data('train', batch)
    val_df = get_data('val', batch)

    # 10x LR decay at epochs 30/60/90, starting from 0.3
    lr_schedule = [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate', lr_schedule),
        HumanHyperParamSetter('learning_rate'),
    ]

    error_metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5'),
    ]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(val_df), error_metrics))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            val_df, error_metrics, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=train_df,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
    )
def get_config(model, fake=False):
    """Build the ImageNet TrainConfig (linear LR scaling rule + warmup).

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, benchmark on FakeData with no callbacks.

    Reads the module-level ``args`` namespace (args.batch, args.fake).
    """
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )
    if fake:
        data = QueueInput(
            FakeData([[batch, 224, 224, 3], [batch]], 1000, random=False,
                     dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))
        # linear scaling rule: LR proportional to total batch (reference 256)
        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            # 10x decay at epochs 30/60/90 plus a final drop at 100
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(START_LR, BASE_LR)),
                                       (30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (90, BASE_LR * 1e-3),
                                       (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            # gradual warmup over the first 5 epochs for large total batch
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, START_LR), (5, BASE_LR)],
                                          interp='linear'))
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
def get_config(model, nr_tower):
    """Build the training configuration for `nr_tower` data-parallel towers."""
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    callbacks = [
        ModelSaver(),
        # 10x LR decay at epochs 30/60/90
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 3e-1), (30, 3e-2), (60, 3e-3), (90, 3e-4)]),
        HumanHyperParamSetter('learning_rate'),
    ]

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        runner = InferenceRunner(QueueInput(dataset_val), infs)
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        runner = DataParallelInferenceRunner(dataset_val, infs,
                                             list(range(nr_tower)))
    callbacks.append(runner)

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=100,
    )
def get_config(fake=False, data_format='NCHW'):
    """Build the TrainConfig; mutates the module-global BATCH_SIZE.

    Args:
        fake (bool): train/validate on FakeData for benchmarking.
        data_format (str): layout string forwarded to Model ('NCHW'/'NHWC').
    """
    nr_tower = max(get_nr_gpu(), 1)
    # per-tower batch is derived from the fixed total batch size and is
    # published through the module-global BATCH_SIZE for other readers
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = dataset_val = FakeData([[64, 224, 224, 3], [64]], 1000,
                                               random=False, dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, BATCH_SIZE))
        dataset_train = get_data('train')
        dataset_val = get_data('val')
    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=dataset_train,
        callbacks=[
            ModelSaver(),
            # NOTE: validation here runs without queue prefetch
            InferenceRunner(dataset_val, [
                ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')
            ]),
            # step-wise 10x LR decay at epochs 30/60/85/95/105
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=nr_tower)
def save(self, path, force=False):
    """Save the fitted model in the given path.

    The instance is pickled with its unpicklable members (the tensorpack
    model and predictor) temporarily detached, then the output folder is
    archived to `path` via `tar_folder`.

    Args:
        path (str): destination archive path.
        force (bool): overwrite `path` if it already exists.
    """
    if os.path.exists(path) and not force:
        logger.info(
            'The indicated path already exists. Use `force=True` to overwrite.'
        )
        return

    base_path = os.path.dirname(path)
    if not os.path.exists(base_path):
        os.makedirs(base_path)

    # Detach members that cannot (or should not) be pickled.
    model = self.model
    dataset_predictor = self.simple_dataset_predictor
    self.model = None
    self.simple_dataset_predictor = None
    try:
        with open('{}/TGANModel'.format(self.output), 'wb') as f:
            pickle.dump(self, f)
    finally:
        # Restore the detached members even if pickling fails, so the
        # instance is never left in a broken (model=None) state.
        self.model = model
        self.simple_dataset_predictor = dataset_predictor

    self.tar_folder(path)

    logger.info('Model saved successfully.')
def get_config(model, nr_tower):
    """TrainConfig with LR linearly annealed from 0.5 to 0 over 3e5 steps."""
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    train_df = get_data('train', batch)
    val_df = get_data('val', batch)

    steps = 1280000 // TOTAL_BATCH_SIZE  # steps per epoch (~one ImageNet pass)
    total_iters = 3 * 10**5
    epochs = (total_iters // steps) + 1

    callbacks = [
        ModelSaver(),
        # step-based linear decay: 0.5 at step 0 down to 0 at the final step
        ScheduledHyperParamSetter('learning_rate', [(0, 0.5), (total_iters, 0)],
                                  interp='linear', step_based=True),
    ]

    metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
               ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(val_df), metrics))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            DataParallelInferenceRunner(val_df, metrics, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=train_df,
        callbacks=callbacks,
        steps_per_epoch=steps,
        max_epoch=epochs,
    )
def convert_param_name(param):
    """Strip the TF ':<output-index>' suffix from every key of `param`.

    Args:
        param (dict): mapping of variable names like 'scope/w:0' to values.

    Returns:
        dict: same values keyed by the name before the first ':'.
    """
    print('--> convert_param_name ...')
    resnet_param = {}
    for name, value in param.items():
        base_name = name.split(":")[0]
        logger.info("Load the weights of the module {}".format(base_name))
        resnet_param[base_name] = value
    return resnet_param
def get_config(model, fake=False):
    """TrainConfig that supports resuming from epoch `start_`.

    The LR schedule below is the standard step schedule (drops at epochs
    30/60/90/105) shifted left by `start_` so a resumed run continues on
    the correct decay stage.

    Args:
        model: the ModelDesc to train.
        fake (bool): benchmark on FakeData with no callbacks.

    Raises:
        ValueError: if `start_` is past the last scheduled LR drop.
    """
    start_ = 0  # epoch this run resumes from; shifts the LR schedule
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        # linear scaling rule: LR proportional to total batch (reference 256)
        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # Previously only printed a message and then crashed later with a
            # NameError on `lr_setting`; fail loudly and early instead.
            raise ValueError(
                "No learning rate schedule defined for start_ epoch {}".format(start_))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
            # TensorPrinter(['tower1/group3/block2/conv2/Abs_0',
            #                'tower1/group3/block2/conv2/Abs_1:0',
            #                'tower1/group3/block2/conv2/Abs_2:0'])
        ]
        if BASE_LR > START_LR:
            # linear warmup over the first 5 epochs for large total batch
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    """TrainConfig for multi-scale training, optionally with distillation.

    Args:
        model: the ModelDesc to train.
        scales: iterable of input scales; top-1/top-5 errors are tracked
            separately per scale.
        distill (bool): additionally track the scale-ensemble errors.
        fake (bool): benchmark on FakeData with no callbacks.
        data_aug (bool): enables the LR schedule used with augmentation and
            selects the longer training length.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000, random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [ModelSaver()]
        if data_aug:
            # step-wise 10x LR decay only when training with augmentation
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                           (95, 1e-5), (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))
        # one (top-1, top-5) error pair per input scale
        infs = []
        for scale in scales:
            infs.append(
                ClassificationError('wrong-scale%03d-top1' % scale,
                                    'val-error-scale%03d-top1' % scale))
            infs.append(
                ClassificationError('wrong-scale%03d-top5' % scale,
                                    'val-error-scale%03d-top5' % scale))
        if distill:
            # ensemble-over-scales metrics used in the distillation setting
            infs.append(
                ClassificationError('wrong-scale_ensemble-top1',
                                    'val-error-scale_ensemble-top1'))
            infs.append(
                ClassificationError('wrong-scale_ensemble-top5',
                                    'val-error-scale_ensemble-top5'))
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=120 if data_aug else 64,
        nr_tower=nr_tower)
def get_config(model, fake=False):
    """ImageNet TrainConfig with linear LR scaling and a 3-epoch warmup.

    Args:
        model: the ModelDesc to train.
        fake (bool): benchmark on FakeData with no callbacks.

    Reads the module-level ``args`` namespace (args.batch, args.fake).
    """
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000, random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        # linear scaling rule relative to the reference batch size 256
        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            # 10x decay at epochs 30/60/85/95/105
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, BASE_LR * 1e-1),
                                       (60, BASE_LR * 1e-2),
                                       (85, BASE_LR * 1e-3),
                                       (95, BASE_LR * 1e-4),
                                       (105, BASE_LR * 1e-5)]),
        ]
        if BASE_LR > 0.1:
            # linear warmup from 0.1 over the first 3 epochs for large batch
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, 0.1), (3, BASE_LR)],
                                          interp='linear'))
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
def get_config(model):
    """Minimal single-step benchmark config driven by fake uint8 image data."""
    batch = 1  # fixed; kept for parity with the sibling get_config variants
    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    fake_df = FakeData([[1, 224, 224, 3], [1]], 1, random=False, dtype='uint8')
    return TrainConfig(
        model=model,
        data=QueueInput(fake_df),
        callbacks=[],
        steps_per_epoch=1,
        max_epoch=1)
def get_config(model, fake=False):
    """TrainConfig for finetuning; validation uses the 'loss-*' error metrics.

    Args:
        model: the ModelDesc to train.
        fake (bool): benchmark on FakeData with no callbacks.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000, random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            # step decay every 20 epochs (Finetune-on-COCO schedule; see
            # the alternative schedules kept below)
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5),
                                       (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Finetune COCO
        #[(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5),(80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO
        #[(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5),(80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC
        #[(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60,1.25e-5),(80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]
        #infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #        ClassificationError('wrong-top5', 'val-error-top5')]
        infs = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=nr_tower)
def guess_inputs(input_dir):
    """Locate the newest graph (.meta) and model checkpoint in `input_dir`.

    Returns:
        (model_path, meta_path): full paths to the checkpoint with the
        highest numeric id and the lexicographically-last graph file.
    """
    meta_candidates = []
    model_candidates = []
    for fname in os.listdir(input_dir):
        if fname.startswith('graph-') and fname.endswith('.meta'):
            meta_candidates.append(fname)
        if fname.startswith('model-') and fname.endswith('.index'):
            ckpt_id = int(fname[len('model-'):-len('.index')])
            model_candidates.append((fname, ckpt_id))

    assert len(meta_candidates)
    meta = max(meta_candidates)  # lexicographically newest graph file
    if len(meta_candidates) > 1:
        logger.info("Choosing {} from {} as graph file.".format(
            meta, meta_candidates))
    else:
        logger.info("Choosing {} as graph file.".format(meta))

    assert len(model_candidates)
    # highest checkpoint id wins (ids are unique per filename)
    model = max(model_candidates, key=lambda pair: pair[1])[0]
    if len(model_candidates) > 1:
        logger.info("Choosing {} from {} as model file.".format(
            model, [x[0] for x in model_candidates]))
    else:
        logger.info("Choosing {} as model file.".format(model))

    return os.path.join(input_dir, model), os.path.join(input_dir, meta)
def log_config_info(config):
    """Pretty-print the config dict as a table and persist it as JSON.

    Args:
        config (dict): flat mapping of config name -> value; values must be
            JSON-serializable for the dump below.
    """
    config_keys = sorted(config.keys())
    data = []
    for k in config_keys:
        data.append([k, config[k]])
    headers = ['config_name', 'content']
    table = tabulate(data, headers=headers)
    logger.info(colored("List of Config Args: \n", 'cyan') + table)
    # save as json (into the active tensorpack logger directory)
    writefile = os.path.join(logger.get_logger_dir(), 'config.json')
    with open(writefile, 'w') as f:
        json.dump(config, f)
def get_config(model, fake=False, xla=False):
    """Short benchmark TrainConfig (10 steps, 1 epoch), optionally with XLA.

    Args:
        model: the ModelDesc to train.
        fake (bool): benchmark on FakeData with no callbacks.
        xla (bool): build a session config with XLA JIT enabled (see NOTE).
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000, random=False,
                                 dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(10, 1e-2), (20, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    config = tf.ConfigProto()
    jit_level = 0
    if xla:
        # Turns on XLA JIT compilation
        jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.optimizer_options.global_jit_level = jit_level
    # NOTE(review): `config` is constructed but never passed to TrainConfig,
    # so the XLA setting has no effect here — confirm whether it should be
    # wired into the session creator.
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       nr_tower=nr_tower)
def get_config(model, fake=False):
    """Build the ImageNet TrainConfig (linear LR scaling rule + warmup).

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, benchmark on FakeData with no callbacks.

    Reads the module-level ``args`` namespace (args.batch, args.fake).
    """
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))
        # linear scaling rule: LR proportional to total batch (reference 256)
        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            # 10x decay at epochs 30/60/90 plus a final drop at 100
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)),
                    (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                    (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            # gradual warmup over the first 5 epochs for large total batch
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
def get_config(args, model, num_gpus, num_towers):
    """ Create the TensorPack Trainer configuration.

    :param args: The cli arguments.
    :param model: The model object to train.
    :param num_gpus: The number of gpus on which to train
    :param num_towers: The number of data parallel towers to create
    :return: A TrainConfig object.
    """
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, args.batch_size))

    df_train = avatar_synth_df(args.train_dir, args.batch_size, args.num_threads)
    df_test = avatar_synth_df(args.test_dir, args.batch_size, args.num_threads)

    def update_lr(epoch, cur_lr):
        """ Approximate exponential decay of the learning rate """
        if args.resume_lr:
            # continue decaying from whatever the current LR is
            return cur_lr * args.lr_decay
        else:
            # recompute from the base LR so decay is a pure function of epoch
            return args.lr * args.lr_decay**epoch

    callbacks = [
        cb.ModelSaver(),
        # NOTE(review): MinSaver monitors 'val-error-top1', but the only
        # inference stat produced below is 'Avatar_Synth/Cost' — confirm the
        # monitored name is actually emitted somewhere.
        cb.MinSaver('val-error-top1'),
        cb.HyperParamSetterWithFunc('tower0/Avatar_Synth/LR:0', update_lr),
        cb.MergeAllSummaries(period=args.summary_freq),
    ]
    infs = [cb.ScalarStats('Avatar_Synth/Cost')]

    if num_gpus > 0:
        callbacks.append(cb.GPUUtilizationTracker())
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(cb.InferenceRunner(QueueInput(df_test), infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(
            cb.DataParallelInferenceRunner(df_test, infs,
                                           list(range(num_towers))))

    return TrainConfig(model=model,
                       dataflow=df_train,
                       callbacks=callbacks,
                       max_epoch=args.epochs,
                       nr_tower=num_towers)
def get_config(model, checkpoint_dir, target_shape, fake=False):
    """TrainConfig for training at a custom input resolution.

    Args:
        model: the ModelDesc to train.
        checkpoint_dir (str): directory for ModelSaver checkpoints.
        target_shape (int): square input edge length for the data pipeline.
        fake (bool): benchmark on FakeData with no callbacks.

    NOTE(review): the return statement tests the global `args.fake` rather
    than this function's `fake` parameter — confirm they stay in sync.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, target_shape, target_shape, 3], [64]],
                                 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch, target_shape)
        dataset_val = get_data('val', batch, target_shape)
        callbacks = [
            ModelSaver(checkpoint_dir=checkpoint_dir),
            # step-wise 10x LR decay at epochs 30/60/85/95/105
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    # 7.5 it / sec testing
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 300,  #5000
        max_epoch=110,
        nr_tower=nr_tower)
def get_config(model, fake=False):
    """Build the TrainConfig; `fake` switches to FakeData benchmarking."""
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        # linear scaling rule: LR grows with total batch (reference batch 256)
        BASE_LR = 0.1 * (args.batch / 256.0)
        decay_points = [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                        (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', decay_points),
        ]
        if BASE_LR > 0.1:
            # ramp up linearly from 0.1 over the first 3 epochs (warmup)
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        val_metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
                       ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), val_metrics))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, val_metrics, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=105,
    )
def get_config(model):
    """TrainConfig driven by the per-GPU batch size from the CLI args."""
    nr_tower = get_nr_gpu()
    per_gpu_batch = args.batch_size_per_gpu
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, per_gpu_batch))

    dataset_train = get_data('train', per_gpu_batch)
    dataset_val = get_data('val', per_gpu_batch)

    # scale the base LR with the total batch size (reference 256)
    BASE_LR = 1e-3 * (per_gpu_batch * nr_tower / 256.0)
    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, BASE_LR), (60, BASE_LR * 1e-1),
                                   (90, BASE_LR * 1e-2)]),
        HumanHyperParamSetter('learning_rate'),
    ]
    # warmup, currently disabled:
    # if BASE_LR > 0.1:
    #     callbacks.append(ScheduledHyperParamSetter(
    #         'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

    metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
               ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(dataset_val), metrics))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, metrics, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=1280000 // (per_gpu_batch * nr_tower),
        max_epoch=110,
    )
def _restart_episode(self):
    """ restart current episode

    Resets per-agent terminal flags, rewards, scores and the location /
    q-value histories, then starts a fresh random game.
    """
    logger.info("Medical Player restarting episode")
    # one terminal flag and one reward slot per agent
    self.terminal = [False] * self.agents
    self.reward = np.zeros((self.agents, ))
    self.cnt = 0  # counter to limit number of steps per episodes
    self.num_games.feed(1)
    # per agent: `_history_length` zero-locations of dimensionality `dims`
    self._loc_history = [[(0, ) * self.dims
                          for _ in range(self._history_length)]
                         for _ in range(self.agents)]
    # list of q-value lists (zeroed, same history depth)
    self._qvalues_history = [[(0, ) * self.actions
                              for _ in range(self._history_length)]
                             for _ in range(self.agents)]
    for i in range(0, self.agents):
        self.current_episode_score[i].reset()
    self.new_random_game()
def get_config(model, data_dir, crop_method_TR, color_augmentation,
               crop_method_TS):
    """TrainConfig for finetuning with configurable crop/color augmentation."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    # data pipelines: training uses CAM pickles, validation its own crop method
    train_df = get_data('train', data_dir, batch, crop_method_TR,
                        color_augmentation=color_augmentation,
                        CAM_dir_pkl=CAM_DIR_PKL)
    val_df = get_data('val', data_dir, batch, crop_method_TS)

    callbacks = [
        # keep every epoch's checkpoint
        ModelSaver(max_to_keep=MAX_EPOCH),
        # moderate LRs for finetuning a ResNet-18 pretrained on ImageNet
        ScheduledHyperParamSetter('learning_rate',
                                  [(0, 1e-3), (20, 5e-4), (40, 1e-4), (60, 1e-5)]),
        HumanHyperParamSetter('learning_rate'),
    ]

    # binary (0/1) task: only top-1 error is tracked
    metrics = [ClassificationError('wrong-top1', 'val-error-top1')]
    if nr_tower == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(QueueInput(val_df), metrics))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(val_df, metrics,
                                                     list(range(nr_tower))))

    # steps_per_epoch intentionally left to the dataflow size
    return TrainConfig(
        model=model,
        dataflow=train_df,
        callbacks=callbacks,
        max_epoch=MAX_EPOCH,
        nr_tower=nr_tower,
    )
def _import_external_ops(message):
    """Import the TF extension module hinted at by an error `message`.

    Best-effort recovery used when graph loading fails because of custom
    ops: the op name embedded in `message` decides which module to import,
    for its side effect of registering the op kernels.  Messages that match
    no known op are logged as errors and otherwise ignored.
    """
    if "horovod" in message.lower():
        logger.info("Importing horovod ...")
        import horovod.tensorflow  # noqa
        return
    if "MaxBytesInUse" in message:
        logger.info("Importing memory_stats ...")
        from tensorflow.contrib.memory_stats import MaxBytesInUse  # noqa
        return
    if 'Nccl' in message:
        logger.info("Importing nccl ...")
        if TF_version <= (1, 12):
            # TF <= 1.12 ships NCCL in contrib and needs the .so preloaded;
            # silently skip if the private loader is unavailable.
            try:
                from tensorflow.contrib.nccl.python.ops.nccl_ops import _validate_and_load_nccl_so
            except Exception:
                pass
            else:
                _validate_and_load_nccl_so()
                from tensorflow.contrib.nccl.ops import gen_nccl_ops  # noqa
        else:
            from tensorflow.python.ops import gen_nccl_ops  # noqa
        return
    if 'ZMQConnection' in message:
        import zmq_ops  # noqa
        return
    logger.error("Unhandled error: " + message)
def get_config(model, conf):
    """TrainConfig driven by a `conf` namespace (batch / fake / data_dir)."""
    nr_tower = max(get_nr_gpu(), 1)
    batch = conf.batch

    if conf.fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000,
                                 random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data(conf.data_dir, 'train', batch)
        dataset_val = get_data(conf.data_dir, 'val', batch)

        callbacks = [
            ModelSaver(),
            # late 10x decays at epochs 45/60/65/70/75
            ScheduledHyperParamSetter('learning_rate',
                                      [(45, 1e-2), (60, 1e-3), (65, 1e-4),
                                       (70, 1e-5), (75, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]

        metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
                   ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), metrics))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, metrics, list(range(nr_tower))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=5000,
                       max_epoch=80,
                       nr_tower=nr_tower)
def get_config(model, fake=False, data_aug=True):
    """Auto-resumable TrainConfig; `data_aug` selects schedule and length.

    Args:
        model: the ModelDesc to train.
        fake (bool): benchmark on FakeData with no callbacks.
        data_aug (bool): enables the LR schedule tied to augmented training
            and the longer (110-epoch) run.
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [
            ModelSaver(),
        ]
        if data_aug:
            # step-wise 10x LR decay only when training with augmentation
            callbacks.append(ScheduledHyperParamSetter('learning_rate',
                                                       [(30, 1e-2), (60, 1e-3),
                                                        (85, 1e-4), (95, 1e-5),
                                                        (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))
    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=110 if data_aug else 64,
        nr_tower=nr_tower
    )
def get_config(model, nr_tower):
    """TrainConfig with a step-based linear LR anneal (0.5 -> 0 over 300k iters)."""
    batch = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    step_size = 1280000 // TOTAL_BATCH_SIZE  # steps per epoch
    max_iter = 3 * 10**5
    max_epoch = (max_iter // step_size) + 1

    lr_anneal = ScheduledHyperParamSetter('learning_rate',
                                          [(0, 0.5), (max_iter, 0)],
                                          interp='linear', step_based=True)
    callbacks = [ModelSaver(), lr_anneal]

    infs = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    # single-GPU inference with queue prefetch; otherwise the multi-GPU
    # runner (with mandatory queue prefetch)
    runner = (InferenceRunner(QueueInput(dataset_val), infs)
              if nr_tower == 1
              else DataParallelInferenceRunner(dataset_val, infs,
                                               list(range(nr_tower))))
    callbacks.append(runner)

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=step_size,
        max_epoch=max_epoch,
    )
def get_config(model, fake=False):
    """ImageNet TrainConfig with optional cosine-LR mode.

    Args:
        model: the ModelDesc to train.
        fake (bool): benchmark on FakeData with no callbacks.

    Reads the module-level ``args`` (args.batch, args.data, args.cosine_lr).
    """
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        # cap the validation batch to keep inference memory bounded
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        # linear scaling rule relative to the reference batch size 256
        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker()
        ]
        if not args.cosine_lr:
            # step schedule: 10x decay at epochs 30/60/90
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate',
                    [(0, BASE_LR), (30, BASE_LR * 1e-1),
                     (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3)]))
        # NOTE(review): when args.cosine_lr is set, no schedule is added
        # here — presumably a cosine callback is attached elsewhere; confirm.
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))
    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )
input.setup(M.get_inputs_desc()) M.build_graph(input) else: tf.train.import_meta_graph(args.meta) # loading... init = get_model_loader(args.model) sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) init.init(sess) # dump ... with sess.as_default(): if args.output.endswith('npy') or args.output.endswith('npz'): varmanip.dump_session_params(args.output) else: var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) var.extend(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES)) gvars = set([k.name for k in tf.global_variables()]) var = [v for v in var if v.name in gvars] var_dict = {} for v in var: name = varmanip.get_savename_from_varname(v.name) var_dict[name] = v logger.info("Variables to dump:") logger.info(", ".join(var_dict.keys())) saver = tf.train.Saver(var_list=var_dict, write_version=tf.train.SaverDef.V2) saver.save(sess, args.output, write_meta_graph=False)
def new_random_game(self):
    """Start a new episode.

    Samples a new image from ``self.sampled_files``, configures the
    (multi)scale stepping, randomizes one starting point per agent away
    from the volume border, and initializes the screen, q-values and
    the current distance of each agent to its target landmark.
    """
    self.terminal = [False] * self.agents
    self.viewer = None

    # sample a new image
    self._image, self._target_loc, self.filepath, self.spacing = next(
        self.sampled_files)
    self.filename = [
        os.path.basename(self.filepath[i]) for i in range(self.agents)
    ]

    # multiscale (e.g. start with 3 -> 2 -> 1)
    # scale can be thought of as sampling stride
    if self.multiscale:
        # brain settings (cardiac used step 6 / scale 2 in earlier runs)
        self.action_step = 9
        self.xscale = 3
        self.yscale = 3
        self.zscale = 3
    else:
        self.action_step = 1
        self.xscale = 1
        self.yscale = 1
        self.zscale = 1

    # image volume size
    self._image_dims = self._image[0].dims

    ########################################################################
    # select a random starting point per agent.
    # Pad away from the image border: training keeps a wider margin (1/5
    # of each dimension) than evaluation/play (1/4).
    margin_fraction = 5 if self.task == 'train' else 4
    skip_thickness = tuple(
        int(dim / margin_fraction) for dim in self._image_dims)

    # TODO: should agents start at the same random points? agents get stuck
    x = [
        self.rng.randint(skip_thickness[0],
                         self._image_dims[0] - skip_thickness[0])
        for _ in range(self.agents)
    ]
    y = [
        self.rng.randint(skip_thickness[1],
                         self._image_dims[1] - skip_thickness[1])
        for _ in range(self.agents)
    ]
    z = [
        self.rng.randint(skip_thickness[2],
                         self._image_dims[2] - skip_thickness[2])
        for _ in range(self.agents)
    ]
    ########################################################################

    self._location = [(x[i], y[i], z[i]) for i in range(self.agents)]
    self._start_location = [(x[i], y[i], z[i]) for i in range(self.agents)]
    # BUG FIX: build one independent q-value list per agent. The previous
    # ``[[0] * self.actions] * self.agents`` replicated a reference to a
    # single inner list, so updating one agent's q-values mutated all.
    self._qvalues = [[0] * self.actions for _ in range(self.agents)]
    self._screen = self._current_state()

    if self.task == 'play':
        # no ground-truth target in play mode
        self.cur_dist = [0] * self.agents
    else:
        self.cur_dist = [
            self.calcDistance(self._location[i], self._target_loc[i],
                              self.spacing)
            for i in range(self.agents)
        ]
    logger.info("Current distance is " + str(self.cur_dist))
choices=[50, 101]) parser.add_argument('--logdir', default='train_log/ResNet-GN') parser.add_argument('--WS', action='store_true', help='Use Weight Standardization') args = parser.parse_args() model = Model() model.depth = args.depth model.use_WS = args.WS if args.eval: batch = 128 # something that can run on one gpu ds = get_imagenet_dataflow(args.data, 'val', batch) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) else: if args.fake: logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd') else: logger.set_logger_dir(args.logdir, 'd') try: from tensorpack.tfutils import collect_env_info logger.info("\n" + collect_env_info()) except Exception: pass config = get_config(model, fake=args.fake) if args.load: config.session_init = get_model_loader(args.load) trainer = SyncMultiGPUTrainerReplicated(max(get_num_gpu(), 1)) launch_train_with_config(config, trainer)
def log_tensor_info(tensors):
    """Log the name and static shape of every tensor in ``tensors``."""
    for tensor in tensors:
        logger.info(
            "name: {}, shape: {}".format(tensor.name, tensor.get_shape()))
args = parser.parse_args()

# Rebuild the graph structure from the meta-graph file; clear_devices drops
# any device placements baked into the checkpoint.
tf.train.import_meta_graph(args.meta, clear_devices=True)

# loading...
init = get_model_loader(args.input)
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
init.init(sess)

# dump ...
with sess.as_default():
    if args.output.endswith('npy') or args.output.endswith('npz'):
        # numpy output: let tensorpack serialize all session params
        varmanip.dump_session_params(args.output)
    else:
        # checkpoint output: collect trainable + model variables ...
        var = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        var.extend(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES))
        # ... but keep only those that actually exist in the graph
        gvars = set([k.name for k in tf.global_variables()])
        var = [v for v in var if v.name in gvars]
        # map each variable to its canonical save name
        var_dict = {}
        for v in var:
            name = varmanip.get_savename_from_varname(v.name)
            var_dict[name] = v
        logger.info("Variables to dump:")
        logger.info(", ".join(var_dict.keys()))
        saver = tf.train.Saver(
            var_list=var_dict, write_version=tf.train.SaverDef.V2)
        # write_meta_graph=False: only the variable values, no graph def
        saver.save(sess, args.output, write_meta_graph=False)