def train(args, logdir1, logdir2):
    """Train Net2, restoring weights from existing Net2 and Net1 checkpoints.

    Args:
        args: Parsed command-line namespace. ``args.ckpt`` optionally names a
            checkpoint inside ``logdir2``; ``args.gpu`` is a comma-separated
            list of GPU ids, or empty/None to train without the multi-GPU
            trainer.
        logdir1: Directory searched for the latest Net1 checkpoint.
        logdir2: Directory used for Net2 logging and checkpoints; also passed
            to ``preprocessing``.
    """
    # model
    model = Net2()

    preprocessing(data_path, logdir2)

    # dataflow
    df = Net2DataFlow(data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    # One epoch is one pass over the wav files found under data_path/wav.
    dataset_size = len(glob.glob(data_path + '/wav/*.wav'))
    steps_per_epoch = dataset_size // hp.train2.batch_size
    print("\t\tdata_path : ", data_path)
    print("\t\tDataset Size : ", dataset_size)
    print("\t\tBatch Size : ", hp.train2.batch_size)
    print("\t\tSteps per epoch : ", steps_per_epoch)

    # NOTE(review): presumably a pause so the summary above can be read before
    # training output starts -- confirm it is still wanted.
    from time import sleep
    sleep(10)

    # Restore order: the Net2 checkpoint first (explicit name wins over the
    # latest one), then the latest Net1 checkpoint with its 'global_step'
    # variable ignored.
    session_inits = []
    if args.ckpt:
        ckpt2 = '{}/{}'.format(logdir2, args.ckpt)
    else:
        ckpt2 = tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=steps_per_epoch,
        session_init=ChainInit(session_inits))

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # Once CUDA_VISIBLE_DEVICES is set, the selected devices are
        # renumbered from 0, so the trainer must be given the GPU *count*
        # rather than the physical ids listed in args.gpu.
        num_gpu = len(args.gpu.split(','))
        train_conf.nr_tower = num_gpu
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        # Without a GPU list there is nothing to replicate over.
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir):
    """Run Net1 training with per-epoch evaluation on the test dataflow.

    Args:
        args: Parsed command-line namespace. ``args.ckpt`` optionally names a
            checkpoint inside ``logdir`` to restore (otherwise the latest
            checkpoint in ``logdir`` is used when one exists); ``args.gpu`` is
            a comma-separated GPU id list, or empty for ``SimpleTrainer``.
        logdir: Directory for logs and model checkpoints.
    """
    net = Net1()

    # Training data first, then test data.
    for dataset_path in (data_path, test_path):
        preprocessing(dataset_path)

    batch_size = hp.train1.batch_size
    train_flow = Net1DataFlow(data_path, batch_size)
    eval_flow = Net1DataFlow(test_path, batch_size)

    # Event files and saved models both go to logdir.
    logger.set_logger_dir(logdir)

    # Based on the tensorpack boilerplate example:
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py
    train_input = QueueInput(train_flow(n_prefetch=batch_size * 10, n_thread=1))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        eval_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))
    config = AutoResumeTrainConfig(
        model=net,
        data=train_input,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    # An explicitly named checkpoint takes precedence over the latest one.
    if args.ckpt:
        restore_path = '{}/{}'.format(logdir, args.ckpt)
    else:
        restore_path = tf.train.latest_checkpoint(logdir)

    # Configured default; only the count derived from args.gpu is consumed.
    tower_count = hp.train1.num_gpu

    if restore_path:
        config.session_init = SaverRestore(restore_path)

    if not args.gpu:
        trainer = SimpleTrainer()
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        tower_count = len(args.gpu.split(','))
        config.nr_tower = tower_count
        trainer = SyncMultiGPUTrainerReplicated(tower_count)

    launch_train_with_config(config, trainer=trainer)
def get_config(model, fake=False):
    """Assemble the ``AutoResumeTrainConfig`` used to train ``model``.

    The global ``args.batch`` is split evenly across the available towers
    (at least one).  With ``fake`` set, synthetic uint8 data is used and no
    callbacks are installed; otherwise the 'train' dataflow is used together
    with model saving, a stepped learning-rate schedule (plus a linear ramp
    over the first 5 epochs when the scaled rate exceeds 0.1) and validation
    top-1/top-5 error reporting.

    Args:
        model: Model description passed through to the config.
        fake: Use ``FakeData`` instead of the real training dataflow.

    Returns:
        AutoResumeTrainConfig: configuration with ``max_epoch=60``.
    """
    tower_count = max(get_num_gpu(), 1)
    assert args.batch % tower_count == 0
    per_tower = args.batch // tower_count

    logger.info("Running on {} towers. Batch size per tower: {}".format(
        tower_count, per_tower))
    if not 32 <= per_tower <= 64:
        logger.warn(
            "Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported."
        )

    if fake:
        fake_shapes = [
            [per_tower, 224, 224, 3],
            [per_tower],
            [per_tower, 224, 224, 3],
            [per_tower],
        ]
        data = QueueInput(
            FakeData(fake_shapes, 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', per_tower))

        # The rate scales linearly with the total batch size (256 -> 0.1).
        start_lr = 0.1
        base_lr = start_lr * (args.batch / 256.0)
        step_schedule = [
            (0, min(start_lr, base_lr)),
            (30, base_lr * 1e-1),
            (45, base_lr * 1e-2),
            (55, base_lr * 1e-3),
        ]
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter('learning_rate', step_schedule),
        ]
        if base_lr > start_lr:
            # Ramp linearly from the start rate up to the scaled rate.
            warmup = ScheduledHyperParamSetter(
                'learning_rate', [(0, start_lr), (5, base_lr)],
                interp='linear')
            callbacks.append(warmup)

        infs = [
            ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5'),
        ]
        val_flow = get_data('val', per_tower)
        if tower_count != 1:
            # multi-GPU inference (with mandatory queue prefetch)
            runner = DataParallelInferenceRunner(
                val_flow, infs, list(range(tower_count)))
        else:
            # single-GPU inference with queue prefetch
            runner = InferenceRunner(QueueInput(val_flow), infs)
        callbacks.append(runner)

    # NOTE(review): this reads the global args.fake, not the `fake`
    # parameter -- confirm that callers always pass fake=args.fake.
    if args.fake:
        steps = 100
    else:
        steps = 1280000 // args.batch

    return AutoResumeTrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=steps,
        max_epoch=60,
    )
def get_config(model, scales, distill=False, fake=False, data_aug=True):
    """Assemble the ``AutoResumeTrainConfig`` for multi-scale training.

    Args:
        model: Model description passed through to the config.
        scales: Iterable of integer scales; a top-1 and a top-5 validation
            error monitor is registered for each one.
        distill: Also register the ``scale_ensemble`` top-1/top-5 monitors.
        fake: Use ``FakeData`` with a fixed batch of 64 and no callbacks.
        data_aug: Forwarded to ``get_data``; also enables the learning-rate
            schedule and selects 120 epochs instead of 64.

    Returns:
        AutoResumeTrainConfig: the assembled configuration.
    """
    tower_count = max(get_nr_gpu(), 1)
    per_tower = TOTAL_BATCH_SIZE // tower_count

    if not fake:
        logger.info("Running on {} towers. Batch size per tower: {}".format(
            tower_count, per_tower))
        dataset_train = get_data('train', per_tower, data_aug)
        val_flow = get_data('val', per_tower, data_aug)

        callbacks = [ModelSaver()]
        if data_aug:
            lr_steps = [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5),
                        (105, 1e-6)]
            callbacks.append(
                ScheduledHyperParamSetter('learning_rate', lr_steps))
        callbacks.append(HumanHyperParamSetter('learning_rate'))

        # Per-scale monitors first, then the optional ensemble monitors.
        infs = []
        for scale in scales:
            infs.extend([
                ClassificationError('wrong-scale%03d-top1' % scale,
                                    'val-error-scale%03d-top1' % scale),
                ClassificationError('wrong-scale%03d-top5' % scale,
                                    'val-error-scale%03d-top5' % scale),
            ])
        if distill:
            infs.extend([
                ClassificationError('wrong-scale_ensemble-top1',
                                    'val-error-scale_ensemble-top1'),
                ClassificationError('wrong-scale_ensemble-top5',
                                    'val-error-scale_ensemble-top5'),
            ])

        if tower_count != 1:
            # multi-GPU inference (with mandatory queue prefetch)
            runner = DataParallelInferenceRunner(
                val_flow, infs, list(range(tower_count)))
        else:
            # single-GPU inference with queue prefetch
            runner = InferenceRunner(QueueInput(val_flow), infs)
        callbacks.append(runner)
    else:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData([[64, 224, 224, 3], [64]],
                                 1000,
                                 random=False,
                                 dtype='uint8')
        callbacks = []

    if TOTAL_BATCH_SIZE == 256:
        steps = 5000
    else:
        steps = 10000

    if data_aug:
        epochs = 120
    else:
        epochs = 64

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps,
        max_epoch=epochs,
        nr_tower=tower_count)
def train(args, logdir):
    """Train Net1 on preprocessed TIMIT ``.npz`` files.

    Args:
        args: Parsed command-line namespace. ``args.case`` selects the
            sub-directory of ``hp.train1.preproc_data_path`` to read from;
            ``args.ckpt`` optionally names a checkpoint inside ``logdir``
            (otherwise the latest checkpoint there is restored, if any).
        logdir: Directory for logs and model checkpoints.

    Device selection comes from ``hp.default.use_gpu`` and
    ``hp.default.gpu_list`` rather than from ``args``.
    """
    net = Net1()

    # Glob patterns for the train and test splits.
    train_glob = os.path.join(hp.train1.preproc_data_path, args.case,
                              'TIMIT/TRAIN/*/*/*.npz')
    test_glob = os.path.join(hp.train1.preproc_data_path, args.case,
                             'TIMIT/TEST/*/*/*.npz')
    print(train_glob)
    print(test_glob)

    train_flow = Net1DataFlow(train_glob, hp.train1.batch_size)
    eval_flow = Net1DataFlow(test_glob, hp.train1.batch_size)

    # Event files and saved models both go to logdir.
    logger.set_logger_dir(logdir)

    train_input = QueueInput(train_flow(n_prefetch=1000, n_thread=8))
    saver = ModelSaver(checkpoint_dir=logdir)
    evaluator = InferenceRunner(
        eval_flow(n_prefetch=1),
        ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix=''))
    config = AutoResumeTrainConfig(
        model=net,
        data=train_input,
        callbacks=[saver, evaluator],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
    )

    # An explicitly named checkpoint takes precedence over the latest one.
    if args.ckpt:
        restore_path = '{}/{}'.format(logdir, args.ckpt)
    else:
        restore_path = tf.train.latest_checkpoint(logdir)
    if restore_path:
        config.session_init = SaverRestore(restore_path)

    # Equality with True is deliberate: only True (or a value equal to it)
    # selects the GPU path.
    if hp.default.use_gpu == True:
        visible = hp.default.gpu_list
        os.environ['CUDA_VISIBLE_DEVICES'] = visible
        tower_count = len(visible.split(','))
        config.nr_tower = tower_count
        trainer = SyncMultiGPUTrainerReplicated(tower_count)
    else:
        # Hide every GPU so training runs on CPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(config, trainer=trainer)