def get_config(model, fake=False):
    """Build the TrainConfig for ImageNet training.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, feed fixed fake data (benchmark mode) and
            attach no callbacks.

    Returns:
        TrainConfig
    """
    num_towers = max(get_num_gpu(), 1)
    assert args.batch % num_towers == 0
    per_tower_batch = args.batch // num_towers

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        num_towers, per_tower_batch))
    if per_tower_batch < 32 or per_tower_batch > 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )

    if fake:
        # Fixed (non-random) fake data: exercises the training loop
        # without any disk I/O.
        data = QueueInput(
            FakeData([[per_tower_batch, 224, 224, 3], [per_tower_batch]],
                     1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', per_tower_batch))

        START_LR = 0.1
        # Linear scaling rule: LR grows with the total batch size.
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate',
                [(0, min(START_LR, BASE_LR)),
                 (30, BASE_LR * 1e-1),
                 (60, BASE_LR * 1e-2),
                 (90, BASE_LR * 1e-3),
                 (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            # Linear warmup over the first 5 epochs for large total batch.
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)],
                    interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', per_tower_batch)
        if num_towers == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(num_towers))))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
def get_config(fake=False, data_format='NCHW'):
    """Build a TrainConfig for (multi-)tower ImageNet training.

    Args:
        fake (bool): benchmark mode — train and validate on fixed fake data.
        data_format (str): image layout handed to the Model.

    Returns:
        TrainConfig
    """
    num_towers = max(get_nr_gpu(), 1)
    # Publish the per-tower batch size through the module-level global.
    global BATCH_SIZE
    BATCH_SIZE = TOTAL_BATCH_SIZE // num_towers

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        train_flow = val_flow = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            num_towers, BATCH_SIZE))
        train_flow = get_data('train')
        val_flow = get_data('val')

    error_metrics = [
        ClassificationError('wrong-top1', 'val-error-top1'),
        ClassificationError('wrong-top5', 'val-error-top5')
    ]
    return TrainConfig(
        model=Model(data_format=data_format),
        dataflow=train_flow,
        callbacks=[
            ModelSaver(),
            InferenceRunner(val_flow, error_metrics),
            # Step-wise LR decay; HumanHyperParamSetter allows manual
            # overrides from a file while training.
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ],
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=num_towers)
def get_config(model, fake=False):
    """Build the TrainConfig for ImageNet training.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on fixed fake data with no callbacks.

    Returns:
        TrainConfig
    """
    num_towers = max(get_nr_gpu(), 1)
    tower_batch = TOTAL_BATCH_SIZE // num_towers

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        train_flow = FakeData([[64, 224, 224, 3], [64]], 1000,
                              random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, tower_batch))
        train_flow = get_data('train', tower_batch)
        val_flow = get_data('val', tower_batch)

        callbacks = [
            ModelSaver(),
            # Step-wise LR decay; HumanHyperParamSetter allows manual override.
            ScheduledHyperParamSetter('learning_rate',
                                      [(30, 1e-2), (60, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        error_metrics = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if num_towers == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(val_flow), error_metrics))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                val_flow, error_metrics, list(range(num_towers))))

    return TrainConfig(
        model=model,
        dataflow=train_flow,
        callbacks=callbacks,
        steps_per_epoch=5000,
        max_epoch=110,
        nr_tower=num_towers
    )
def get_config(model, fake=False):
    """Build the TrainConfig for ImageNet training, with the LR decay
    schedule shifted by ``start_`` to support resuming mid-training.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on fixed fake data with no callbacks.

    Returns:
        TrainConfig

    Raises:
        ValueError: if ``start_`` >= 91, where no LR schedule is defined.
    """
    start_ = 0  # epoch offset when resuming; shifts the decay schedule below
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    if fake:
        dataset_train = FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1
        # Linear scaling rule: LR grows with the total batch size.
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # BUG FIX: the original only printed a message here and fell
            # through with `lr_setting` undefined, which later raised a
            # confusing NameError. Fail fast with a clear error instead.
            raise ValueError(
                'no learning rate setting defined for start_={}'.format(start_))

        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
        ]
        if BASE_LR > START_LR:
            # Linear warmup over the first 5 epochs for large total batch.
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)],
                    interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
def get_config(model):
    """Build the TrainConfig for training, restoring initial weights
    from ``args.load``.

    Args:
        model: the ModelDesc to train.

    Returns:
        TrainConfig
    """
    num_gpus = max(get_num_gpu(), 1)
    per_gpu_batch = args.batch // num_gpus
    logger.info("Running on {} towers. Batch size per tower: {}".format(num_gpus, per_gpu_batch))

    callbacks = [ThroughputTracker(args.batch)]

    if args.fake:
        data = QueueInput(FakeData(
            [[per_gpu_batch, 224, 224, 3], [per_gpu_batch]],
            1000, random=False, dtype='uint8'))
    else:
        data = QueueInput(
            get_imagenet_dataflow(args.data, 'train', per_gpu_batch),
            # use a larger queue
            queue=tf.FIFOQueue(200, [tf.uint8, tf.int32],
                               [[per_gpu_batch, 224, 224, 3], [per_gpu_batch]])
        )

    BASE_LR = 30
    # Linear scaling rule: LR grows with the total batch size.
    SCALED_LR = BASE_LR * (args.batch / 256.0)
    decay_schedule = [
        (0, SCALED_LR),
        (60, SCALED_LR * 1e-1),
        (70, SCALED_LR * 1e-2),
        (80, SCALED_LR * 1e-3),
        (90, SCALED_LR * 1e-4),
    ]
    callbacks += [
        ModelSaver(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter('learning_rate', decay_schedule),
    ]

    val_flow = get_imagenet_dataflow(args.data, 'val', 64)
    error_metrics = [ClassificationError('wrong-top1', 'val-error-top1'),
                     ClassificationError('wrong-top5', 'val-error-top5')]
    if num_gpus == 1:
        callbacks.append(InferenceRunner(QueueInput(val_flow), error_metrics))
    else:
        callbacks.append(DataParallelInferenceRunner(
            val_flow, error_metrics, list(range(num_gpus))))

    if args.load.endswith(".npz"):
        # a released model in npz format
        init = SmartInit(args.load)
    else:
        # a pre-trained checkpoint
        init = SaverRestore(args.load, ignore=("learning_rate", "global_step"))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        session_init=init,
        max_epoch=100,
    )
def get_config(model, fake=False):
    """Build the TrainConfig for ImageNet training with LR linear
    scaling and warmup.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on fixed fake data with no callbacks.

    Returns:
        TrainConfig
    """
    num_towers = max(get_nr_gpu(), 1)
    assert args.batch % num_towers == 0
    tower_batch = args.batch // num_towers

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000,
                                 random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            num_towers, tower_batch))
        dataset_train = get_data('train', tower_batch)
        dataset_val = get_data('val', tower_batch)

        # Linear scaling rule: LR grows with the total batch size.
        BASE_LR = 0.1 * (args.batch / 256.0)
        decay_schedule = [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                          (85, BASE_LR * 1e-3), (95, BASE_LR * 1e-4),
                          (105, BASE_LR * 1e-5)]
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', decay_schedule),
        ]
        if BASE_LR > 0.1:
            # Warm up linearly from 0.1 over the first 3 epochs.
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, 0.1), (3, BASE_LR)],
                                          interp='linear'))

        error_metrics = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if num_towers == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), error_metrics))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, error_metrics, list(range(num_towers))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )
def get_config(model):
    """Minimal benchmark config: a single training step on one fake sample.

    Args:
        model: the ModelDesc to run.

    Returns:
        TrainConfig with one step, one epoch, and no callbacks.
    """
    batch = 1
    logger.info("For benchmark, batch size is fixed to 1 per tower.")
    fake_input = FakeData([[1, 224, 224, 3], [1]], 1, random=False,
                          dtype='uint8')
    data = QueueInput(fake_input)
    return TrainConfig(model=model,
                       data=data,
                       callbacks=[],
                       steps_per_epoch=1,
                       max_epoch=1)
def get_config(model, fake=False):
    """Build the TrainConfig for finetuning.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on fixed fake data with no callbacks.

    Returns:
        TrainConfig
    """
    num_towers = max(get_nr_gpu(), 1)
    tower_batch = TOTAL_BATCH_SIZE // num_towers

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000,
                                 random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            num_towers, tower_batch))
        dataset_train = get_data('train', tower_batch)
        dataset_val = get_data('val', tower_batch)

        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4),
                                       (60, 5e-5), (80, 2.5e-5),
                                       (100, 1.25e-5), (120, 5e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        # Alternative LR schedules used for other setups:
        # Finetune COCO
        # [(0, 5e-4), (20, 2.5e-4), (40, 1.25e-4), (60, 5e-5), (80, 2.5e-5), (100, 1.25e-5), (120, 5e-6)]
        # JT COCO
        # [(0, 2.5e-4), (20, 1.25e-4), (40, 5e-5), (60, 2.5e-5), (80, 1e-5), (100, 5e-6), (120, 2.5e-6)]
        # Finetune to VOC
        # [(0, 1.25e-4), (20, 5e-5), (40, 2.5e-5), (60, 1.25e-5), (80, 5e-6), (100, 2.5e-6), (120, 1.25e-6)]

        # infs = [ClassificationError('wrong-top1', 'val-error-top1'),
        #         ClassificationError('wrong-top5', 'val-error-top5')]
        # Validation errors here are reported under 'loss-' prefixed names.
        error_metrics = [
            ClassificationError('loss-wrong-top1', 'loss-val-error-top1'),
            ClassificationError('loss-wrong-top5', 'loss-val-error-top5')
        ]
        if num_towers == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), error_metrics))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, error_metrics, list(range(num_towers))))

    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=1522,
                       max_epoch=140,
                       nr_tower=num_towers)
def get_config(model):
    """Build the TrainConfig for distributed (Horovod) training.

    Args:
        model: the ModelDesc to train; its input signature shapes the data.

    Returns:
        TrainConfig
    """
    input_sig = model.get_input_signature()
    world_size = max(hvd.size(), 1)
    worker_batch = args.batch // world_size
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        world_size, worker_batch))

    callbacks = [ThroughputTracker(args.batch), UpdateMomentumEncoder()]

    if args.fake:
        data = QueueInput(
            FakeData([x.shape for x in input_sig], 1000,
                     random=False, dtype='uint8'))
    else:
        # Pull samples over ZMQ, then feed them through tf.data with
        # device prefetch onto the first GPU.
        zmq_addr = 'ipc://@imagenet-train-b{}'.format(worker_batch)
        data = ZMQInput(zmq_addr, 25, bind=False)
        dataset = data.to_dataset(input_sig).repeat().prefetch(15)
        dataset = dataset.apply(
            tf.data.experimental.prefetch_to_device('/gpu:0'))
        data = TFDatasetInput(dataset)

    callbacks += [ModelSaver(), EstimatedTimeLeft()]

    if not args.v2:  # step-wise LR in v1
        # Linear scaling rule: LR grows with the total batch size.
        SCALED_LR = BASE_LR * (args.batch / 256.0)
        callbacks.append(
            ScheduledHyperParamSetter('learning_rate',
                                      [(0, min(BASE_LR, SCALED_LR)),
                                       (120, SCALED_LR * 1e-1),
                                       (160, SCALED_LR * 1e-2)]))
        if SCALED_LR > BASE_LR:
            # Linear warmup over the first 5 epochs for large total batch.
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, BASE_LR), (5, SCALED_LR)],
                                          interp='linear'))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=200,
    )
def get_config(model, fake=False, xla=False):
    """Build a TrainConfig for a short benchmark run (10 steps, 1 epoch).

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on fixed fake data with no callbacks.
        xla (bool): if True, set XLA JIT level on a locally-built ConfigProto.

    Returns:
        TrainConfig
    """
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower
    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]], 1000,
                                 random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)
        callbacks = [
            ModelSaver(),
            # Step-wise LR decay; HumanHyperParamSetter allows manual override.
            ScheduledHyperParamSetter('learning_rate',
                                      [(10, 1e-2), (20, 1e-3), (85, 1e-4),
                                       (95, 1e-5), (105, 1e-6)]),
            HumanHyperParamSetter('learning_rate'),
        ]
        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))
    config = tf.ConfigProto()
    jit_level = 0
    if xla:
        # Turns on XLA JIT compilation
        jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.optimizer_options.global_jit_level = jit_level
    # NOTE(review): `config` is built above but never passed to TrainConfig
    # or any session creator visible here, so the XLA setting appears to
    # have no effect — confirm how (or whether) it is consumed downstream.
    return TrainConfig(model=model,
                       dataflow=dataset_train,
                       callbacks=callbacks,
                       steps_per_epoch=10,
                       max_epoch=1,
                       nr_tower=nr_tower)
parser = argparse.ArgumentParser() parser.add_argument('--data', help='ILSVRC dataset dir') parser.add_argument('--fake', action='store_true') parser.add_argument('--batch', help='per-GPU batch size', default=32, type=int) parser.add_argument('--benchmark', action='store_true') parser.add_argument('--no-zmq-ops', action='store_true') args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = '' if args.fake: ds = FakeData([[args.batch, 224, 224, 3], [args.batch]], 1000, random=False, dtype=['uint8', 'int32']) else: augs = fbresnet_augmentor(True) ds = get_data(args.batch, augs) logger.info("Serving data on {}".format(socket.gethostname())) if args.benchmark: from zmq_ops import dump_arrays ds = MapData(ds, dump_arrays) TestDataSpeed(ds, warmup=300).start() else: format = None if args.no_zmq_ops else 'zmq_ops' send_dataflow_zmq(ds, 'ipc://@imagenet-train-b{}'.format(args.batch),
def get_config(model, fake=False):
    """Build the TrainConfig for ImageNet training, with the LR decay
    schedule shifted by ``start_`` to support resuming mid-training.

    Args:
        model: the ModelDesc to train.
        fake (bool): if True, train on fixed fake data with no callbacks.

    Returns:
        TrainConfig (model, dataflow, callbacks, steps_per_epoch, max_epoch).

    Raises:
        ValueError: if ``start_`` >= 91, where no LR schedule is defined.
    """
    start_ = 0  # epoch offset when resuming; shifts the decay schedule below
    # Number of towers = number of available GPUs (at least 1).
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    # Per-GPU batch size.
    batch = args.batch // nr_tower
    # Log how many towers we run on and the per-tower batch size.
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))

    if fake:
        # FakeData arguments:
        #   shapes (list): list of lists/tuples — the shape of each component.
        #   size (int): size of this dataflow.
        #   random (bool): whether to randomly generate data on each
        #       iteration (generation alone can be time-consuming!).
        #   dtype (str or list): data type(s) of the components.
        # Used to benchmark/test the model without real training data.
        dataset_train = FakeData([[batch, 224, 224, 3], [batch]], 1000,
                                 random=False, dtype='uint8')
        callbacks = []
    else:
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        START_LR = 0.1  # initial learning rate
        # Linear scaling rule: base LR grows with the total batch size.
        BASE_LR = START_LR * (args.batch / 256.0)
        if start_ < 31:
            lr_setting = [(max(30 - start_, 0), BASE_LR * 1e-1),
                          (60 - start_, BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 61:
            lr_setting = [(max(60 - start_, 0), BASE_LR * 1e-2),
                          (90 - start_, BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        elif start_ < 91:
            lr_setting = [(max(90 - start_, 0), BASE_LR * 1e-3),
                          (105 - start_, BASE_LR * 1e-4)]
        else:
            # BUG FIX: the original only printed a message here and fell
            # through with `lr_setting` undefined, which later raised a
            # confusing NameError. Fail fast with a clear error instead.
            raise ValueError(
                'no learning rate setting defined for start_={}'.format(start_))

        # Callbacks: checkpointing, remaining-time estimation, and the
        # scheduled learning-rate setter built above.
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', lr_setting),
        ]
        if BASE_LR > START_LR:
            # Linear warmup over the first 5 epochs for large total batch.
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate',
                                          [(0, START_LR), (5, BASE_LR)],
                                          interp='linear'))

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')
        ]
        # Choose the inference runner based on the number of GPUs.
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(
                DataParallelInferenceRunner(dataset_val, infs,
                                            list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=120,
    )
parser.add_argument('--fake', action='store_true') parser.add_argument('--batch', help='per-GPU batch size', default=32, type=int) parser.add_argument('--warmup', help='prefetch buffer size', default=150, type=int) parser.add_argument('--port', help='server port', default=1000, type=int) parser.add_argument('--benchmark', action='store_true') parser.add_argument('--no-zmq-ops', action='store_true') args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = '' if args.fake: ds = FakeData( [[args.batch, args.image_size, args.image_size, 3], [args.batch]], 1000, random=False, dtype=['uint8', 'int32']) else: augs = fbresnet_augmentor(True, image_size=args.image_size) ds = get_data(args.batch, augs, args.worker) logger.info("Serving data on {}".format(socket.gethostname())) if args.benchmark: from zmq_ops import dump_arrays ds = MapData(ds, dump_arrays) TestDataSpeed(ds, warmup=300).start() else: format = None if args.no_zmq_ops else 'zmq_ops' send_dataflow_zmq( ds, 'ipc://@imagenet-train-b{}-p{}'.format(args.batch, args.port),
parser.add_argument('--data', help='ILSVRC dataset dir') parser.add_argument('--fake', action='store_true') parser.add_argument('--batch', help='per-GPU batch size', default=32, type=int) parser.add_argument('--benchmark', action='store_true') parser.add_argument('--v2', action='store_true') parser.add_argument('--no-zmq-ops', action='store_true') args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = '' if args.fake: ds = FakeData([[args.batch, 224, 224, 3], [args.batch, 224, 224, 3]], 9999999, random=False, dtype=['uint8', 'uint8']) else: aug = get_moco_v2_augmentor() if args.v2 else get_moco_v1_augmentor() ds = get_moco_dataflow(args.data, args.batch, aug) logger.info("Serving data on {}".format(socket.gethostname())) if args.benchmark: ds = MapData(ds, dump_arrays) TestDataSpeed(ds, size=99999, warmup=300).start() else: format = None if args.no_zmq_ops else 'zmq_ops' send_dataflow_zmq(ds, 'ipc://@imagenet-train-b{}'.format(args.batch), hwm=200,