def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'], trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            MaxSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())

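# A minimal usage sketch for train_vqvae, not taken from the original project.
# The `Params` wrapper and all concrete values below are hypothetical; the only
# requirements visible in the function above are that `params` is dict-like with
# 'dataset'/'model'/'trainer' sections, exposes to_file(), and that the trainer
# section carries batch_size, num_parallel, steps_per_epoch and max_epochs.
if __name__ == '__main__':
    params = Params({                              # hypothetical config helper
        'dataset': {},
        'model': {'image_shape': (28, 28, 1)},     # example shape only
        'trainer': {'batch_size': 64, 'num_parallel': 4,
                    'steps_per_epoch': 500, 'max_epochs': 100},
    })
    train_vqvae(params, 'mnist', 'train_log/vqvae')
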
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False, dataset_params['train_val_split'],
            trainer_params['batch_size'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds, infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)

def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
    valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt['model_flags']
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
    ]
    for param_name, param_info in opt['manual_parameters'].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(
            ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    # NOTE: `nr_gpus` is assumed to be defined in the enclosing module/config.
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs,
                                    list(range(nr_gpus))))
    if self.model_mode == 'seg_gland':
        callbacks.append(MaxSaver('valid_dice_obj'))
    elif self.model_mode == 'seg_nuc':
        callbacks.append(MaxSaver('valid_dice_np'))
    else:
        callbacks.append(MaxSaver('valid_auc'))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt['nr_epochs'],
    )
    config.session_init = sess_init
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return

def run(args):
    num_gpus = get_nr_gpu()
    num_towers = max(num_gpus, 1)

    config = get_config(args, AvatarSynthModel(args), num_gpus, num_towers)
    if args.load_path:
        config.session_init = SaverRestore(args.load_path)

    # trainer = SyncMultiGPUTrainerParameterServer(num_towers)
    # trainer = QueueInputTrainer()
    trainer = SyncMultiGPUTrainerReplicated(num_towers)
    launch_train_with_config(config, trainer)

def train_image_embedding_softmax(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False, dataset_params['train_val_split'],
        trainer_params['batch_size'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction',
                                    'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status',
                     ['loss', 'accuracy',
                      'validation_loss', 'val-correct_prediction'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)

def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
    valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt["model_flags"]
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        # ModelSaver(max_to_keep=20),  # TODO dynamic this
        ModelSaver(max_to_keep=opt["nr_epochs"]),
        # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
    ]
    for param_name, param_info in opt["manual_parameters"].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    # NOTE: `nr_gpus` is assumed to be defined in the enclosing module/config.
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
    )
    callbacks.append(MaxSaver("valid_dice"))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt["nr_epochs"],
    )
    config.session_init = sess_init
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    # TODO: save
    return

def run(model):
    instance = Model(model, model.conf.data_format)
    if not model.conf.is_train:
        batch = 64
        dataset = get_data(model.conf.data_dir, 'val', batch)
        eval_on_ILSVRC12(
            instance,
            get_model_loader(model.conf.logdir + '/' + model.conf.test_step),
            dataset)
    else:
        logger.set_logger_dir(os.path.join(model.conf.logdir))
        config = get_config(instance, model.conf)
        if model.conf.reload_step:
            config.session_init = get_model_loader(
                model.conf.logdir + '/' + model.conf.reload_step)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)

def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, _, _ = \
        get_dataflow(dataset_params['path'],
                     dataset_params['binarizer'],
                     dataset_params['train_val_split'],
                     trainer_params['batch_size'],
                     trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_params['latent_dim']

    model = BaseVAE.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)

def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(mode='train')
    valid_datagen = self.get_datagen(mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    callbacks = [
        ModelSaver(max_to_keep=200),
        ScheduledHyperParamSetter('learning_rate', self.lr_sched),
    ]
    ######

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs,
                                    list(range(nr_gpus))))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

    config = TrainConfig(
        model=MODEL_MAKER(freeze),
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=self.nr_epochs,
    )
    config.session_init = sess_init
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return

def train_image_embedding_triplet(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds = get_triplet_dataflow(
        dataset_params['path'], trainer_params['items_per_batch'],
        trainer_params['images_per_item'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='loss'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', ['loss', 'pos_triplet_frac'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)

def train(self, args):
    self.args = args

    # Make sure the save path exists
    if not os.path.exists(self.args.save):
        os.makedirs(self.args.save)

    with change_gpu(self.args.gpu):
        train_df = self._dataflow()
        trainer = (SimpleTrainer() if get_num_gpu() <= 1
                   else SyncMultiGPUTrainerReplicated(get_num_gpu()))
        print("Found %d gpus. Using trainer:" % get_num_gpu(), trainer)

        # Set up callbacks
        self._default_callbacks()

        try:
            launch_train_with_config(
                self.pred_config(self.args, train_df, self.callbacks),
                trainer)
        except Exception:
            traceback.print_exc()
        else:
            # If everything worked, save a compact model
            self.export(os.path.join(self.args.save, "compact.pb"))

def critic_train(ctrl, data, log_dir, model_dir, prev_dir, vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    for epoch in range(0, max_epoch):
        if epoch % 1 == 0:
            lr_schedule.append((epoch + 1, lr))
            lr *= 0.9

    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)
    if split_train_val:
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = ds_size * 9 // 10
        if ds_size - val_start == 0:
            val_start = 0
    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)

    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1

    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))

    # Put this into callbacks for in-training validation/inferencing
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)

    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir,
                       max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  # , TFEventWriter()],
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)
    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())

def train(args, cfg):
    out_dirs = gen_outdirs(args, "tp")
    output_dir, out_res_dir = out_dirs["output_dir"], out_dirs["out_res_dir"]
    df = PneuSegDF(args.mode, out_res_dir, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    num_gpu = max(get_num_gpu(), 1)
    ds = df.prepared(num_gpu, cfg.batch_size)

    # Avoid overwriting the config file
    if os.path.exists(pj(output_dir, os.path.basename(args.config))):
        input("Config file will NOT be overwritten. Press Enter to continue...")
    else:
        shutil.copy(args.config, output_dir)
    logger.set_logger_dir(pj(output_dir, "log"))

    callback_list = [
        # PeriodicCallback overrides the frequency of the callback it wraps
        PeriodicCallback(ModelSaver(50, checkpoint_dir=output_dir),
                         every_k_epochs=1),
        GPUUtilizationTracker(),
        MergeAllSummaries(1 if args.train_debug else 0),
        # ProgressBar(["Loss"])
    ]
    if cfg.network["norm_layer"] == "BN_layers":
        callback_list.append(BN_layers_update())
    if cfg.lr_schedule["type"] == "epoch_wise_constant":
        schedule = [(ep, lr / num_gpu) for ep, lr in zip(
            [0] + cfg.lr_schedule["epoch_to_drop_lr"], cfg.lr_schedule["lr"])]
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    elif cfg.lr_schedule["type"] == "halved":
        schedule = [(0, cfg.lr_schedule["init_lr"])]
        for i in range(cfg.lr_schedule["first_epoch2drop"], cfg.max_epoch,
                       cfg.lr_schedule["period"]):
            schedule.append(
                (i, schedule[int((i - cfg.lr_schedule["first_epoch2drop"])
                                 / cfg.lr_schedule["period"])][1]
                 / (cfg.lr_schedule["decay_rate"] * num_gpu)))
        print(schedule)
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))

    steps_per_epoch = len(ds) // num_gpu + 1
    train_cfg = TrainConfig(
        model=Tensorpack_model(cfg, steps_per_epoch),
        data=QueueInput(ds),
        steps_per_epoch=steps_per_epoch,
        callbacks=callback_list,
        monitors=[
            # ScalarPrinter(True, whitelist=["Loss", "LR"]),
            ScalarPrinter(True),
            # ScalarPrinter(),
            TFEventWriter(),
            # JSONWriter()
        ],
        max_epoch=cfg.max_epoch,
        session_init=SmartInit(args.resume),
        starting_epoch=args.resume_epoch)
    launch_train_with_config(
        train_cfg,
        SyncMultiGPUTrainerReplicated(num_gpu) if num_gpu > 1 else SimpleTrainer())

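# A minimal, self-contained sketch of the "halved" learning-rate schedule built
# in train() above, so the index arithmetic into `schedule` is easier to follow.
# The function name and the example values below are hypothetical, not taken
# from the original configuration.
def halved_lr_schedule(init_lr, first_epoch2drop, period, decay_rate,
                       max_epoch, num_gpu):
    schedule = [(0, init_lr)]
    for i in range(first_epoch2drop, max_epoch, period):
        # Each drop divides the most recently appended LR by decay_rate * num_gpu.
        prev_lr = schedule[(i - first_epoch2drop) // period][1]
        schedule.append((i, prev_lr / (decay_rate * num_gpu)))
    return schedule

# Example: halved_lr_schedule(0.1, 10, 10, 2, 50, 1)
# -> [(0, 0.1), (10, 0.05), (20, 0.025), (30, 0.0125), (40, 0.00625)]
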
def train_child(model_cls, args, log_dir, child_dir, prev_dir):
    """Train one child model, optionally warm-starting from child_dir or prev_dir."""
    if not os.path.exists(child_dir):
        os.mkdir(child_dir)

    if os.path.basename(child_dir) == "0" and args.use_init_model:
        init_model_dir = os.path.join(args.data_dir, 'init_model', args.ds_name)
        if os.path.exists(init_model_dir):
            # init_model_dir exists and this is the first model, so we do not
            # need to train. Copy the model and mark it finished.
            logger.info("Skip first model as this model is fully trained.")
            cmd = "mkdir -p {cdir} ; cp {pdir}/* {cdir}/ ".format(
                cdir=child_dir, pdir=init_model_dir)
            _ = subprocess.check_output(cmd, shell=True)
            return

    # get training params for train-config
    (model, args, starting_epoch, lr_schedule,
     ds_train, insrc_train, train_cbs) = get_training_params(model_cls, args)

    ## Model callbacks
    # loss weight update
    ls_cbs_func = getattr(model, 'compute_loss_select_callbacks', None)
    if callable(ls_cbs_func):
        train_cbs.extend(ls_cbs_func())

    # extra callbacks for general logging/update.
    extra_callbacks = DEFAULT_CALLBACKS()
    if not args.do_remote_child_inf_runner:
        extra_callbacks = [ecb for ecb in extra_callbacks
                           if not isinstance(ecb, ProgressBar)]
    logger.info("Extra callbacks are {}".format(
        [ecb.__class__ for ecb in extra_callbacks]))

    # Logging for analysis
    model_str = model.net_info.to_str()
    logger.info('LayerInfoListString is :\n {}'.format(model_str))

    train_callbacks = [
        ModelSaver(checkpoint_dir=child_dir,
                   max_to_keep=1,
                   keep_checkpoint_every_n_hours=100),
    ] + train_cbs
    if lr_schedule:
        train_callbacks.append(
            ScheduledHyperParamSetter('learning_rate', lr_schedule))
    logger.info('The updated params for training are \n{}'.format(args))

    config = TrainConfig(
        data=insrc_train,
        dataflow=ds_train,
        callbacks=train_callbacks,
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  # , TFEventWriter()],
        steps_per_epoch=args.steps_per_epoch,
        max_epoch=args.max_epoch,
        starting_epoch=starting_epoch)

    for dn in [child_dir, prev_dir]:
        if dn is None:
            continue
        ckpt = tf.train.latest_checkpoint(dn)
        if ckpt:
            if args.search_cat_based:
                restore_cls = SaverRestoreSizeRelaxed
            else:
                restore_cls = SaverRestore
            _ignore = [DYNAMIC_WEIGHTS_NAME]
            _sess_init_load = restore_cls(ckpt, ignore=_ignore)
            if dn == child_dir:
                # loading from self: keep the global step
                config.session_init = _sess_init_load
            else:
                # loading from others: reset global_step to 0
                config.session_init = ChainInit([
                    _sess_init_load,
                    AssignGlobalStep(0),
                ])
            break
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(args.nr_gpu))
    return model

# Defaults: Namespace(batch=256, data=None, data_format='NCHW', depth=50,
#     eval=False, fake=False, gpu=None, load=None, log_dir='', mode='resnet')
# If GPUs are specified, expose them via os.environ
if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

# Build a residual network from the requested depth and mode;
# the model's default data layout is NCHW
# (see class Model(ImageNetModel): def __init__(self, depth, mode='resnet'))
model = Model(args.depth, args.mode)
model.data_format = args.data_format

# Evaluation mode (--eval): report the error rate
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)  # prints error rate
# Training mode: set up logging and start learning
else:
    if args.fake:  # benchmark/test this model with fake data
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        log_folder = '/data0/wangguangrun/log_acnt/imagenet-resnet-%s' % (
            args.log_dir)
        logger.set_logger_dir(os.path.join(log_folder))  # save path

    config = get_config(model, fake=args.fake)  # build the training config
    # Optionally restore from a checkpoint
    if args.load:
        config.session_init = get_model_loader(args.load)

    # Train on all GPUs together
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)  # uses get_optimizer and build_graph

                    type=str, default='NCHW')
parser.add_argument('-d', '--depth', help='resnet depth',
                    type=int, default=18, choices=[18, 34, 50, 101, 152])
parser.add_argument('--eval', action='store_true')
parser.add_argument('--batch', default=256, type=int,
                    help='total batch size. 32 per GPU gives best accuracy, '
                         'higher values should be similarly good')
parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                    help='variants of resnet to use', default='resnet')
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

model = Model(args.depth, args.data_format, args.mode)
if args.eval:
    batch = 128  # something that can run on one gpu
    ds = get_data('val', batch)
    eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
else:
    if args.fake:
        logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
    else:
        logger.set_logger_dir(
            os.path.join('train_log',
                         'imagenet-{}-d{}'.format(args.mode, args.depth)))

    config = get_config(model, fake=args.fake)
    if args.load:
        config.session_init = get_model_loader(args.load)
    trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
    launch_train_with_config(config, trainer)

    train_config=session_config, load_model=args.load)
# print(tf.test.is_gpu_available())
# print(get_available_gpus())
print("Net configured")

if args.load:
    print(">>>> Loading stored model parameters.")
    # example args.load: '/path/to/model/folder/model-xxxx'
    config.session_init = SaverRestore(args.load)

if args.tot == 'train':
    if args.mp == 0:
        print("using simple trainer")
        launch_train_with_config(config, SimpleTrainer())
    else:
        print("can use simple (mp=0) trainer, multi-GPU parameter server, or replicated")
        print("for nccl, as well as multiprocess distributed (mp=2) or multithreaded distributed (mp=else)")
        if args.nccl == 0:
            print(">>>> Using " + str(args.num_gpu) +
                  " available GPU parameter server.")
            launch_train_with_config(config,
                                     SyncMultiGPUTrainer(args.num_gpu))
        elif args.num_gpu and args.nccl != 0:
            print(">>>> Using " + str(args.num_gpu) +
                  " available GPU for replicated training (nccl).")

"Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy." "Pretrained models listed in README were trained with batch=32x8.") parser.add_argument('--mode', choices=['resnet', 'preact', 'se'], help='variants of resnet to use', default='resnet') args = parser.parse_args() if args.gpu: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu model = Model(args.depth, args.mode) model.data_format = args.data_format if args.eval: batch = 128 # something that can run on one gpu ds = get_data('val', batch) eval_on_ILSVRC12(model, get_model_loader(args.load), ds) else: if args.fake: logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd') else: logger.set_logger_dir( os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth))) config = get_config(model, fake=args.fake) if args.load: config.session_init = get_model_loader(args.load) trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1)) launch_train_with_config(config, trainer)
                    help='systolic array width',
                    type=int, default=256)
parser.add_argument('--accumulator-array-height',
                    help='accumulator array height',
                    type=int, default=4096)
parser.add_argument('--tensorpack-logdir-id',
                    help='TensorPack training log directory id',
                    type=str, default='')
parser.add_argument('--mpusim-logdir',
                    help='MPU simulator log directory',
                    type=str, default='.')
args = parser.parse_args()

model = Model(args.resnet_depth,
              args.activations_datatype_size_byte,
              args.weights_datatype_size_byte,
              args.results_datatype_size_byte,
              args.systolic_array_height,
              args.systolic_array_width,
              args.accumulator_array_height,
              args.mpusim_logdir)

logger.set_logger_dir(
    os.path.join('train_log',
                 'resnext_{}{}'.format(args.resnet_depth,
                                       args.tensorpack_logdir_id)))

config = get_config(model)
launch_train_with_config(config, SimpleTrainer())