Example #1
def train(args, logdir):

    # model
    model = Net1()

    # dataflow
    df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True))

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt \
        else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
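Example #1 builds train_conf but stops before anything is launched. As a minimal sketch, the usual tensorpack launch step could look like this (the trainer choice is an assumption, not part of the original function):

    # hypothetical continuation: pick a trainer and launch training
    trainer = SyncMultiGPUTrainerParameterServer(len(args.gpu.split(','))) \
        if args.gpu else SimpleTrainer()
    launch_train_with_config(train_conf, trainer)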
Example #2
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
        valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

        ###### set_logger_dir must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt['model_flags']
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
        ]

        for param_name, param_info in opt['manual_parameters'].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(
                ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch);
        # nr_gpus is defined outside this snippet
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))
        if self.model_mode == 'seg_gland':
            callbacks.append(MaxSaver('valid_dice_obj'))
        elif self.model_mode == 'seg_nuc':
            callbacks.append(MaxSaver('valid_dice_np'))
        else:
            callbacks.append(MaxSaver('valid_auc'))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt['nr_epochs'],
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        return
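ScheduledHyperParamSetter takes its schedule as a list of (epoch, value) pairs, so each entry in opt['manual_parameters'] above pairs an initial value with such a schedule. An illustrative entry (names and values are made up):

opt['manual_parameters'] = {
    # (initial value, (epoch, value) schedule for ScheduledHyperParamSetter)
    'learning_rate': (1e-4, [(0, 1e-4), (30, 1e-5)]),
}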
Example #3
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
        valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

        ###### set_logger_dir must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt["model_flags"]
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            # ModelSaver(max_to_keep=20), # TODO dynamic this
            ModelSaver(max_to_keep=opt["nr_epochs"]),
            # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
        ]

        for param_name, param_info in opt["manual_parameters"].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch);
        # nr_gpus is defined outside this snippet
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
        )
        callbacks.append(MaxSaver("valid_dice"))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt["nr_epochs"],
        )
        config.session_init = sess_init

        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        # TODO: save
        return
Example #4
    def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(mode='train')
        valid_datagen = self.get_datagen(mode='valid')

        ###### set_logger_dir must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        callbacks = [
            ModelSaver(max_to_keep=200),
            ScheduledHyperParamSetter('learning_rate', self.lr_sched),
        ]
        ######

        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

        config = TrainConfig(
            model=MODEL_MAKER(freeze),
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=self.nr_epochs,
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        return
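StatCollector above is project-specific, but any tensorpack Inferencer follows the same shape: fetch tensors per batch, aggregate them, and return named stats that callbacks such as MaxSaver('valid_dice_np') can then track. A sketch under that assumption (the tensor names and the metric helper are illustrative, not the project's actual code):

import numpy as np
from tensorpack.callbacks import Inferencer

class DiceCollector(Inferencer):
    """Aggregates a per-batch metric into a single named statistic."""

    def _before_inference(self):
        self.scores = []

    def _get_fetches(self):
        # hypothetical tensor names; the real graph defines its own
        return ['predict_map', 'truth_map']

    def _on_fetches(self, results):
        pred, true = results
        self.scores.append(dice_score(pred, true))  # hypothetical helper

    def _after_inference(self):
        # the returned dict is logged as a stat after each validation run
        return {'valid_dice': float(np.mean(self.scores))}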
Example #5
def train_child(model_cls, args, log_dir, child_dir, prev_dir):
    """
    """
    if not os.path.exists(child_dir):
        os.mkdir(child_dir)

    if os.path.basename(child_dir) == "0" and args.use_init_model:
        init_model_dir = os.path.join(args.data_dir, 'init_model',
                                      args.ds_name)
        if os.path.exists(init_model_dir):
            # init_model_dir exists and this is the first model, so there is
            # no need to train: copy the model over and mark it finished.
            logger.info("Skipping first model, as it is already fully trained.")
            cmd = "mkdir -p {cdir} ; cp {pdir}/* {cdir}/".format(
                cdir=child_dir, pdir=init_model_dir)
            _ = subprocess.check_output(cmd, shell=True)
            return

    # get training params for train-config
    (model, args, starting_epoch, lr_schedule, ds_train, insrc_train,
     train_cbs) = get_training_params(model_cls, args)

    ## Model callbacks
    # loss weight update
    ls_cbs_func = getattr(model, 'compute_loss_select_callbacks', None)
    if callable(ls_cbs_func):
        train_cbs.extend(ls_cbs_func())
    # extra callbacks for general logging/updates.
    extra_callbacks = DEFAULT_CALLBACKS()
    if not args.do_remote_child_inf_runner:
        extra_callbacks = \
            [ecb for ecb in extra_callbacks if not isinstance(ecb, ProgressBar)]
    logger.info("Extra callbacks are {}".format(
        [ecb.__class__ for ecb in extra_callbacks]))

    # Logging for analysis
    model_str = model.net_info.to_str()
    logger.info('LayerInfoListString is :\n {}'.format(model_str))
    train_callbacks = [
        ModelSaver(checkpoint_dir=child_dir,
                   max_to_keep=1,
                   keep_checkpoint_every_n_hours=100),
    ] + train_cbs
    if lr_schedule:
        train_callbacks.append(
            ScheduledHyperParamSetter('learning_rate', lr_schedule))
    logger.info('The updated params for training are:\n{}'.format(args))
    config = TrainConfig(
        data=insrc_train,
        dataflow=ds_train,
        callbacks=train_callbacks,
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  #, TFEventWriter()],
        steps_per_epoch=args.steps_per_epoch,
        max_epoch=args.max_epoch,
        starting_epoch=starting_epoch)
    for dn in [child_dir, prev_dir]:
        if dn is None:
            continue
        ckpt = tf.train.latest_checkpoint(dn)
        if ckpt:
            if args.search_cat_based:
                restore_cls = SaverRestoreSizeRelaxed
            else:
                restore_cls = SaverRestore
            _ignore = [DYNAMIC_WEIGHTS_NAME]
            _sess_init_load = restore_cls(ckpt, ignore=_ignore)
            if dn == child_dir:
                # loading from self: keep the global step
                config.session_init = _sess_init_load
            else:
                # loading from others. Set global_step to 0
                config.session_init = ChainInit([
                    _sess_init_load,
                    AssignGlobalStep(0),
                ])
            break
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(args.nr_gpu))
    return model
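The checkpoint branch above distinguishes resuming (load from child_dir, keep the global step) from warm-starting (load from prev_dir, reset the step). Stripped of the project-specific pieces (SaverRestoreSizeRelaxed, AssignGlobalStep, DYNAMIC_WEIGHTS_NAME), the underlying tensorpack pattern is:

from tensorpack.tfutils.sessinit import ChainInit, SaverRestore

# resume: restore everything from our own checkpoint, global step included
config.session_init = SaverRestore(ckpt)

# warm-start: skip selected variables, then chain further initializers
config.session_init = ChainInit([
    SaverRestore(ckpt, ignore=['some/variable/name']),  # names are illustrative
    # ... e.g. an initializer that resets the global step ...
])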
Example #6
def critic_train(ctrl,
                 data,
                 log_dir,
                 model_dir,
                 prev_dir,
                 vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    # decay lr by 0.9 every epoch; schedule entries are (epoch, value) pairs
    for epoch in range(max_epoch):
        lr_schedule.append((epoch + 1, lr))
        lr *= 0.9
    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)

    if split_train_val:
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = ds_size * 9 // 10
    if ds_size - val_start == 0:
        val_start = 0

    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)
    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1
    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))
    # Put this into callbacks for in-training validation/inferencing
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)
    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir,
                       max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  #, TFEventWriter()],
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)
    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())
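For reference, the decay loop at the top of this example produces the (epoch, value) pairs consumed by ScheduledHyperParamSetter; with critic_init_lr=0.1 and critic_train_epoch=3 it would yield:

lr_schedule = [(1, 0.1), (2, 0.09), (3, 0.081)]  # lr shrinks by 0.9 each epoch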