def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),
    )

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf  # (session_conf above is unused unless this is re-enabled)
    )

    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
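# A minimal, hypothetical driver for train(); the flag names and logdir
# layout below are assumptions, not taken from the original source. Note
# that train() only assembles train_conf: launching the run (e.g. via
# tensorpack's launch_train_with_config) is left to the caller.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-ckpt', default=None, help='checkpoint file to restore')
    parser.add_argument('-gpu', default=None, help='comma-separated GPU ids')
    args = parser.parse_args()
    train(args, logdir='logdir/train1')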
def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
    valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt['model_flags']
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
    ]

    for param_name, param_info in opt['manual_parameters'].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus))))

    if self.model_mode == 'seg_gland':
        callbacks.append(MaxSaver('valid_dice_obj'))
    elif self.model_mode == 'seg_nuc':
        callbacks.append(MaxSaver('valid_dice_np'))
    else:
        callbacks.append(MaxSaver('valid_auc'))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt['nr_epochs'],
    )
    config.session_init = sess_init
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return
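# The opt dict consumed by run_once() must carry at least the keys used
# above; the values here are illustrative only. Schedules follow
# tensorpack's ScheduledHyperParamSetter format of (epoch, value) pairs.
opt = {
    'train_batch_size': 8,
    'infer_batch_size': 16,
    'model_flags': {'freeze': False},  # expanded as **kwargs into the model class (hypothetical flag)
    'manual_parameters': {
        # name -> (initial value, [(epoch, value), ...] schedule)
        'learning_rate': (1.0e-4, [(60, 1.0e-5)]),
    },
    'nr_epochs': 100,
}
# Note: nr_gpus is a free name inside run_once(); it is assumed to be set at
# module level (e.g. parsed from a --gpu flag) before this is called.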
def run_once(self, opt, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
    valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    ######
    model_flags = opt["model_flags"]
    model = self.get_model()(**model_flags)

    ######
    callbacks = [
        # ModelSaver(max_to_keep=20),  # TODO dynamic this
        ModelSaver(max_to_keep=opt["nr_epochs"]),
        # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
    ]

    for param_name, param_info in opt["manual_parameters"].items():
        model.add_manual_variable(param_name, param_info[0])
        callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))

    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
    )
    callbacks.append(MaxSaver("valid_dice"))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    config = TrainConfig(
        model=model,
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=opt["nr_epochs"],
    )
    config.session_init = sess_init
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    # TODO: save
    return
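# Hypothetical two-phase use of run_once(): the trailing
# tf.reset_default_graph() is what makes back-to-back runs in one process
# possible. `trainer` stands in for whatever class defines run_once();
# SaverRestore and tf are the module-level imports used elsewhere in this file.
trainer.run_once(opt, sess_init=None, save_dir="logs/phase1")
ckpt = tf.train.latest_checkpoint("logs/phase1")
trainer.run_once(opt, sess_init=SaverRestore(ckpt), save_dir="logs/phase2")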
def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
    ####
    train_datagen = self.get_datagen(mode='train')
    valid_datagen = self.get_datagen(mode='valid')

    ###### must be called before ModelSaver
    if save_dir is None:
        logger.set_logger_dir(self.save_dir)
    else:
        logger.set_logger_dir(save_dir)

    callbacks = [
        ModelSaver(max_to_keep=200),
        ScheduledHyperParamSetter('learning_rate', self.lr_sched),
    ]

    ######
    # multi-GPU inference (with mandatory queue prefetch)
    infs = [StatCollector()]
    callbacks.append(
        DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus))))

    ######
    steps_per_epoch = train_datagen.size() // nr_gpus

    MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

    config = TrainConfig(
        model=MODEL_MAKER(freeze),
        callbacks=callbacks,
        dataflow=train_datagen,
        steps_per_epoch=steps_per_epoch,
        max_epoch=self.nr_epochs,
    )
    config.session_init = sess_init
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
    tf.reset_default_graph()  # remove the entire graph in case of multiple runs
    return
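# self.lr_sched is handed straight to ScheduledHyperParamSetter, so it is a
# list of (epoch, value) pairs; the numbers below are hypothetical:
lr_sched = [(1, 1.0e-4), (60, 1.0e-5)]  # set LR at epoch 1, drop it at epoch 60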
def train_child(model_cls, args, log_dir, child_dir, prev_dir):
    """Train one child model.

    If this is the first child and a fully-trained init model exists, its
    checkpoint is copied instead of training. Otherwise training resumes from
    child_dir's latest checkpoint (keeping the global step) or warm-starts
    from prev_dir (resetting the global step to 0).
    """
    if not os.path.exists(child_dir):
        os.mkdir(child_dir)

    if os.path.basename(child_dir) == "0" and args.use_init_model:
        init_model_dir = os.path.join(args.data_dir, 'init_model', args.ds_name)
        if os.path.exists(init_model_dir):
            # This implies that there exists init_model_dir, and we are in first model
            # so we do not need to train. Copy the model and mark finished
            logger.info("Skip first model as this model is fully trained.")
            cmd = "mkdir -p {cdir} ; cp {pdir}/* {cdir}/ ".format(
                cdir=child_dir, pdir=init_model_dir)
            _ = subprocess.check_output(cmd, shell=True)
            return

    # get training params for train-config
    (model, args, starting_epoch, lr_schedule,
     ds_train, insrc_train, train_cbs) = get_training_params(model_cls, args)

    ## Model callbacks
    # loss weight update
    ls_cbs_func = getattr(model, 'compute_loss_select_callbacks', None)
    if callable(ls_cbs_func):
        train_cbs.extend(ls_cbs_func())

    # extra callbacks for general logging/update.
    extra_callbacks = DEFAULT_CALLBACKS()
    if not args.do_remote_child_inf_runner:
        extra_callbacks = \
            [ecb for ecb in extra_callbacks if not isinstance(ecb, ProgressBar)]
    logger.info("Extra callbacks are {}".format(
        [ecb.__class__ for ecb in extra_callbacks]))

    # Logging for analysis
    model_str = model.net_info.to_str()
    logger.info('LayerInfoListString is :\n {}'.format(model_str))

    train_callbacks = [
        ModelSaver(checkpoint_dir=child_dir, max_to_keep=1,
                   keep_checkpoint_every_n_hours=100),
    ] + train_cbs
    if lr_schedule:
        train_callbacks.append(
            ScheduledHyperParamSetter('learning_rate', lr_schedule))

    logger.info('The updated params for training are\n{}'.format(args))
    config = TrainConfig(
        data=insrc_train,
        dataflow=ds_train,
        callbacks=train_callbacks,
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  # , TFEventWriter()
        steps_per_epoch=args.steps_per_epoch,
        max_epoch=args.max_epoch,
        starting_epoch=starting_epoch)

    for dn in [child_dir, prev_dir]:
        if dn is None:
            continue
        ckpt = tf.train.latest_checkpoint(dn)
        if ckpt:
            if args.search_cat_based:
                restore_cls = SaverRestoreSizeRelaxed
            else:
                restore_cls = SaverRestore
            _ignore = [DYNAMIC_WEIGHTS_NAME]
            _sess_init_load = restore_cls(ckpt, ignore=_ignore)
            if dn == child_dir:
                # loading from self keeps the global step
                config.session_init = _sess_init_load
            else:
                # loading from others: set global_step to 0
                config.session_init = ChainInit([
                    _sess_init_load,
                    AssignGlobalStep(0),
                ])
            break

    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(args.nr_gpu))
    return model
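# Hypothetical call site for train_child(); model_cls, args and the
# directory layout are stand-ins. Passing the previous child's directory
# lets the new child inherit weights while its step counter restarts at 0.
model = train_child(model_cls, args,
                    log_dir='logs',
                    child_dir='logs/child_0003',
                    prev_dir='logs/child_0002')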
def critic_train(ctrl, data, log_dir, model_dir, prev_dir, vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    for epoch in range(0, max_epoch):
        if epoch % 1 == 0:  # always true; kept for switching to every-N-epoch decay
            lr_schedule.append((epoch + 1, lr))
            lr *= 0.9

    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)
    if split_train_val:
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = ds_size * 9 // 10
        if ds_size - val_start == 0:
            val_start = 0
    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)

    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1

    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))

    # Put this into callbacks for in-training validation/inferencing
    # (note: as written, inference_callback is constructed but never registered)
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)

    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir, max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  # , TFEventWriter()
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)

    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())
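# The schedule loop above decays the critic LR by 10% per epoch; with the
# illustrative values below it expands (up to float rounding) to
# [(1, 0.001), (2, 0.0009), (3, 0.00081), (4, 0.000729)].
init_lr, n_epochs = 1e-3, 4  # stand-ins for ctrl.critic_init_lr / ctrl.critic_train_epoch
lr_schedule = [(e + 1, init_lr * 0.9 ** e) for e in range(n_epochs)]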