Example no. 1
0
def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'],
        trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            MaxSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
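All of these snippets follow the same Tensorpack skeleton: a ModelDesc subclass, a DataFlow, a TrainConfig (or AutoResumeTrainConfig), and a final launch_train_with_config call. Below is a minimal, self-contained sketch of that skeleton using only stock Tensorpack pieces and a TF1-style environment (which these examples already assume); TinyModel and the FakeData shapes are my own stand-ins, not anything defined in the examples.

import tensorflow as tf
from tensorpack import (FakeData, ModelDesc, ModelSaver, SimpleTrainer,
                        TrainConfig, launch_train_with_config)
from tensorpack.utils import logger


class TinyModel(ModelDesc):
    def inputs(self):
        return [tf.TensorSpec((None, 28, 28, 1), tf.float32, 'image'),
                tf.TensorSpec((None,), tf.int32, 'label')]

    def build_graph(self, image, label):
        # A single linear layer is enough to exercise the training loop.
        logits = tf.layers.dense(tf.layers.flatten(image), 10)
        return tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=label, logits=logits), name='loss')

    def optimizer(self):
        return tf.train.AdamOptimizer(1e-3)


if __name__ == '__main__':
    logger.set_logger_dir('train_log/tiny_example')  # must precede ModelSaver
    config = TrainConfig(
        model=TinyModel(),
        # FakeData yields whole batches, so the shapes include the batch dim.
        dataflow=FakeData([[8, 28, 28, 1], [8]], size=100,
                          dtype=['float32', 'int32']),
        callbacks=[ModelSaver(max_to_keep=1)],
        steps_per_epoch=10,
        max_epoch=2,
    )
    launch_train_with_config(config, SimpleTrainer())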
Example no. 2
0
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False,
            dataset_params['train_val_split'], trainer_params['batch_size'],
            trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
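Notification, SendStat, Reconstruct, CompressResource, RestoreWeights and SequentialSampling are callbacks defined in this project, not in Tensorpack itself. As a rough sketch of the pattern, a Notification-like callback could be built on Tensorpack's Callback hooks as below; SimpleNotification is a hypothetical name, and logging is only a stand-in for whatever transport the real callback uses.

from tensorpack.callbacks import Callback
from tensorpack.utils import logger


class SimpleNotification(Callback):
    """Hypothetical stand-in for the Notification callback used above."""

    def __init__(self, title, message):
        self._title = title
        self._message = message

    def _after_train(self):
        # Runs once when the training loop exits.
        logger.info('{}: {}'.format(self._title, self._message))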
Example no. 3
0
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt['train_batch_size'], mode='train')
        valid_datagen = self.get_datagen(opt['infer_batch_size'], mode='valid')

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt['model_flags']
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            ModelSaver(max_to_keep=1, keep_checkpoint_every_n_hours=None),
        ]

        for param_name, param_info in opt['manual_parameters'].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(
                ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))
        if self.model_mode == 'seg_gland':
            callbacks.append(MaxSaver('valid_dice_obj'))
        elif self.model_mode == 'seg_nuc':
            callbacks.append(MaxSaver('valid_dice_np'))
        else:
            callbacks.append(MaxSaver('valid_auc'))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt['nr_epochs'],
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph(
        )  # remove the entire graph in case of multiple runs
        return
Example no. 4
0
def run(args):
    num_gpus = get_nr_gpu()
    num_towers = max(num_gpus, 1)

    config = get_config(args, AvatarSynthModel(args), num_gpus, num_towers)

    if args.load_path:
        config.session_init = SaverRestore(args.load_path)

    # trainer = SyncMultiGPUTrainerParameterServer(num_towers)
    # trainer = QueueInputTrainer()
    trainer = SyncMultiGPUTrainerReplicated(num_towers)
    launch_train_with_config(config, trainer)
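Several snippets (this one, and examples no. 11 and no. 13 below) pick the trainer from the number of visible GPUs. A small helper capturing that recurring choice might look like the sketch below; make_trainer is my own name, and get_num_gpu is the newer spelling of the get_nr_gpu helper used above.

from tensorpack import SimpleTrainer, SyncMultiGPUTrainerReplicated
from tensorpack.utils.gpu import get_num_gpu


def make_trainer():
    """Replicated multi-GPU training when more than one GPU is visible,
    otherwise a plain single-device SimpleTrainer."""
    num_gpu = get_num_gpu()
    return SyncMultiGPUTrainerReplicated(num_gpu) if num_gpu > 1 else SimpleTrainer()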
Example no. 5
0
def train_image_embedding_softmax(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False, dataset_params['train_val_split'],
        trainer_params['batch_size'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction',
                                    'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', [
                'loss', 'accuracy',
                'validation_loss', 'val-correct_prediction'],
                after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example no. 6
0
    def run_once(self, opt, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(opt["train_batch_size"], mode="train")
        valid_datagen = self.get_datagen(opt["infer_batch_size"], mode="valid")

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        ######
        model_flags = opt["model_flags"]
        model = self.get_model()(**model_flags)
        ######
        callbacks = [
            # ModelSaver(max_to_keep=20),  # TODO: make this dynamic
            ModelSaver(max_to_keep=opt["nr_epochs"]),
            # InjectShell(file='/tools/hover_net/src/config.yml', shell='ipython'),
        ]

        for param_name, param_info in opt["manual_parameters"].items():
            model.add_manual_variable(param_name, param_info[0])
            callbacks.append(ScheduledHyperParamSetter(param_name, param_info[1]))
        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs, list(range(nr_gpus)))
        )
        callbacks.append(MaxSaver("valid_dice"))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        config = TrainConfig(
            model=model,
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=opt["nr_epochs"],
        )
        config.session_init = sess_init

        launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph()  # remove the entire graph in case of multiple runs
        # TODO: save
        return
Example no. 7
0
def run(model):
    instance = Model(model, model.conf.data_format)
    if not model.conf.is_train:
        batch = 64
        dataset = get_data(model.conf.data_dir, 'val', batch)
        eval_on_ILSVRC12(
            instance,
            get_model_loader(model.conf.logdir + '/' + model.conf.test_step),
            dataset)
    else:
        logger.set_logger_dir(os.path.join(model.conf.logdir))
        config = get_config(instance, model.conf)
        if model.conf.reload_step:
            config.session_init = get_model_loader(model.conf.logdir + '/' +
                                                   model.conf.reload_step)
        trainer = SyncMultiGPUTrainerParameterServer(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example no. 8
0
def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, _, _ = \
        get_dataflow(dataset_params['path'],
                     dataset_params['binarizer'],
                     dataset_params['train_val_split'],
                     trainer_params['batch_size'],
                     trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_params['latent_dim']
    model = BaseVAE.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example no. 9
0
    def run_once(self, nr_gpus, freeze, sess_init=None, save_dir=None):
        ####
        train_datagen = self.get_datagen(mode='train')
        valid_datagen = self.get_datagen(mode='valid')

        ###### must be called before ModelSaver
        if save_dir is None:
            logger.set_logger_dir(self.save_dir)
        else:
            logger.set_logger_dir(save_dir)

        callbacks = [
            ModelSaver(max_to_keep=200),
            ScheduledHyperParamSetter('learning_rate', self.lr_sched),
        ]
        ######

        # multi-GPU inference (with mandatory queue prefetch)
        infs = [StatCollector()]
        callbacks.append(
            DataParallelInferenceRunner(valid_datagen, infs,
                                        list(range(nr_gpus))))

        ######
        steps_per_epoch = train_datagen.size() // nr_gpus

        MODEL_MAKER = Model_NP_XY if self.model_mode == 'np+xy' else Model_NP_DIST

        config = TrainConfig(
            model=MODEL_MAKER(freeze),
            callbacks=callbacks,
            dataflow=train_datagen,
            steps_per_epoch=steps_per_epoch,
            max_epoch=self.nr_epochs,
        )
        config.session_init = sess_init

        launch_train_with_config(config,
                                 SyncMultiGPUTrainerParameterServer(nr_gpus))
        tf.reset_default_graph(
        )  # remove the entire graph in case of multiple runs
        return
Example no. 10
0
def train_image_embedding_triplet(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds = get_triplet_dataflow(
        dataset_params['path'], trainer_params['items_per_batch'],
        trainer_params['images_per_item'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0],
                                     'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='loss'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', ['loss', 'pos_triplet_frac'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
Example no. 11
0
    def train(self, args):
        self.args = args
        # Make sure the save path exists
        if not os.path.exists(self.args.save):
            os.makedirs(self.args.save)

        with change_gpu(self.args.gpu):
            train_df = self._dataflow()
            trainer = (SimpleTrainer() if get_num_gpu() <= 1 else
                       SyncMultiGPUTrainerReplicated(get_num_gpu()))
            print("Found %d gpus. Using trainer:" % get_num_gpu(), trainer)
            # Setup callbacks
            self._default_callbacks()
            try:
                launch_train_with_config(
                    self.pred_config(self.args, train_df, self.callbacks),
                    trainer)
            except Exception as error:
                traceback.print_exc()
            else:
                # If everything worked, save a compact model
                self.export(os.path.join(self.args.save, "compact.pb"))
Example no. 12
0
def critic_train(ctrl,
                 data,
                 log_dir,
                 model_dir,
                 prev_dir,
                 vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    for epoch in range(0, max_epoch):
        if epoch % 1 == 0:
            lr_schedule.append((epoch + 1, lr))
            lr *= 0.9
    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)

    if split_train_val:
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = ds_size * 9 // 10
    if ds_size - val_start == 0:
        val_start = 0

    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)
    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1
    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))
    # Put this into callbacks for in-training validation/inferencing
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)
    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir,
                       max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  #, TFEventWriter()],
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)
    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())
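The lr_schedule loop above appends one entry per epoch (the epoch % 1 == 0 guard is always true) and decays the learning rate by 10% each time, feeding the result to ScheduledHyperParamSetter. With hypothetical values critic_init_lr = 1e-3 and critic_train_epoch = 4 it would produce:

lr, lr_schedule = 1e-3, []
for epoch in range(4):
    lr_schedule.append((epoch + 1, lr))
    lr *= 0.9
print(lr_schedule)  # roughly [(1, 1e-03), (2, 9e-04), (3, 8.1e-04), (4, 7.29e-04)]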
Example no. 13
0
def train(args, cfg):
    out_dirs = gen_outdirs(args, "tp")
    output_dir, out_res_dir = out_dirs["output_dir"], out_dirs["out_res_dir"]
    df = PneuSegDF(args.mode, out_res_dir, args.train_dir, args.testset_dir,
                   args.min_num_workers, cfg)
    num_gpu = max(get_num_gpu(), 1)
    ds = df.prepared(num_gpu, cfg.batch_size)

    # Avoid overwriting the config file
    if os.path.exists(pj(output_dir, os.path.basename(args.config))):
        input(
            "Config file will NOT be overwritten. Press Enter to continue...")
    else:
        shutil.copy(args.config, output_dir)
    logger.set_logger_dir(pj(output_dir, "log"))
    callback_list = [
        # PeriodicCallback overrides the trigger frequency of the wrapped callback
        PeriodicCallback(ModelSaver(50, checkpoint_dir=output_dir),
                         every_k_epochs=1),
        GPUUtilizationTracker(),
        MergeAllSummaries(1 if args.train_debug else 0),
        # ProgressBar(["Loss"])
    ]
    if cfg.network["norm_layer"] == "BN_layers":
        callback_list.append(BN_layers_update())
    if cfg.lr_schedule["type"] == "epoch_wise_constant":
        schedule = [(ep, lr / num_gpu) for ep, lr in zip(
            [0] + cfg.lr_schedule["epoch_to_drop_lr"], cfg.lr_schedule["lr"])]
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    elif cfg.lr_schedule["type"] == "halved":
        schedule = [(0, cfg.lr_schedule["init_lr"])]
        for i in range(cfg.lr_schedule["first_epoch2drop"], cfg.max_epoch,
                       cfg.lr_schedule["period"]):
            schedule.append(
                (i, schedule[int((i - cfg.lr_schedule["first_epoch2drop"]) /
                                 cfg.lr_schedule["period"])][1] /
                 (cfg.lr_schedule["decay_rate"] * num_gpu)))
        print(schedule)
        callback_list.append(
            ScheduledHyperParamSetter("learning_rate", schedule))
    steps_per_epoch = len(ds) // num_gpu + 1
    train_cfg = TrainConfig(
        model=Tensorpack_model(cfg, steps_per_epoch),
        data=QueueInput(ds),
        steps_per_epoch=steps_per_epoch,
        callbacks=callback_list,
        monitors=[
            # ScalarPrinter(True, whitelist=["Loss", "LR"]),
            ScalarPrinter(True),
            # ScalarPrinter(),
            TFEventWriter(),
            # JSONWriter()
        ],
        max_epoch=cfg.max_epoch,
        session_init=SmartInit(args.resume),
        starting_epoch=args.resume_epoch)
    launch_train_with_config(
        train_cfg,
        SyncMultiGPUTrainerReplicated(num_gpu)
        if num_gpu > 1 else SimpleTrainer())
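In the "halved" branch above, each new schedule entry divides the previously appended entry's learning rate by decay_rate * num_gpu. A worked sketch with hypothetical config values (init_lr=1e-2, first_epoch2drop=10, period=10, decay_rate=2.0, num_gpu=1, max_epoch=40):

schedule = [(0, 1e-2)]
for i in range(10, 40, 10):
    schedule.append((i, schedule[int((i - 10) / 10)][1] / (2.0 * 1)))
print(schedule)  # [(0, 0.01), (10, 0.005), (20, 0.0025), (30, 0.00125)]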
Example no. 14
0
def train_child(model_cls, args, log_dir, child_dir, prev_dir):
    """
    """
    if not os.path.exists(child_dir):
        os.mkdir(child_dir)

    if os.path.basename(child_dir) == "0" and args.use_init_model:
        init_model_dir = os.path.join(args.data_dir, 'init_model',
                                      args.ds_name)
        if os.path.exists(init_model_dir):
            # init_model_dir exists and this is the first model, so no training
            # is needed: copy the model over and mark it as finished.
            logger.info("Skip first model as this model is fully trained.")
            cmd = "mkdir -p {cdir} ; cp {pdir}/* {cdir}/ ".format(\
                cdir=child_dir, pdir=args.init_model_dir)
            _ = subprocess.check_output(cmd, shell=True)
            return

    # get training params for train-config
    (model, args, starting_epoch, lr_schedule, ds_train, insrc_train,
     train_cbs) = get_training_params(model_cls, args)

    ## Model callbacks
    # loss weight update
    ls_cbs_func = getattr(model, 'compute_loss_select_callbacks', None)
    if callable(ls_cbs_func):
        train_cbs.extend(ls_cbs_func())
    # Extra callbacks for general logging/updates.
    extra_callbacks = DEFAULT_CALLBACKS()
    if not args.do_remote_child_inf_runner:
        extra_callbacks = \
            [ecb for ecb in extra_callbacks if not isinstance(ecb, ProgressBar)]
    logger.info("Extra callbacks are {}".format(
        [ecb.__class__ for ecb in extra_callbacks]))

    # Logging for analysis
    model_str = model.net_info.to_str()
    logger.info('LayerInfoListString is :\n {}'.format(model_str))
    train_callbacks = [
        ModelSaver(checkpoint_dir=child_dir,
                   max_to_keep=1,
                   keep_checkpoint_every_n_hours=100),
    ] + train_cbs
    if lr_schedule:
        train_callbacks.append(
            ScheduledHyperParamSetter('learning_rate', lr_schedule))
    logger.info('The updated params for training are\n{}'.format(args))
    config = TrainConfig(
        data=insrc_train,
        dataflow=ds_train,
        callbacks=train_callbacks,
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  #, TFEventWriter()],
        steps_per_epoch=args.steps_per_epoch,
        max_epoch=args.max_epoch,
        starting_epoch=starting_epoch)
    for dn in [child_dir, prev_dir]:
        if dn is None:
            continue
        ckpt = tf.train.latest_checkpoint(dn)
        if ckpt:
            if args.search_cat_based:
                restore_cls = SaverRestoreSizeRelaxed
            else:
                restore_cls = SaverRestore
            _ignore = [DYNAMIC_WEIGHTS_NAME]
            _sess_init_load = restore_cls(ckpt, ignore=_ignore)
            if dn == child_dir:
                # loading from self keep global step
                config.session_init = _sess_init_load
            else:
                # loading from others. Set global_step to 0
                config.session_init = ChainInit([
                    _sess_init_load,
                    AssignGlobalStep(0),
                ])
            break
    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(args.nr_gpu))
    return model
Example no. 15
0
    # Defaults are Namespace(batch=256, data=None, data_format='NCHW', depth=50, eval=False, fake=False, gpu=None, load=None, log_dir='', mode='resnet')
    # If a GPU list is given, expose it to the process via os.environ
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    # Build a residual network from the given depth and mode; the model's data format defaults to NCHW
    model = Model(
        args.depth, args.mode
    )  # see class Model(ImageNetModel), def __init__(self, depth, mode='resnet')
    model.data_format = args.data_format
    # In evaluation mode (--eval), report the error rate
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)  # report the error rate
    # Otherwise train: set up logging and start learning
    else:
        if args.fake:  # test or benchmark this model with fake data
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            log_folder = '/data0/wangguangrun/log_acnt/imagenet-resnet-%s' % (
                args.log_dir)
            logger.set_logger_dir(os.path.join(log_folder))  # save path
        config = get_config(model, fake=args.fake)  # build the training config
        # Optionally restore a saved model
        if args.load:
            config.session_init = get_model_loader(args.load)
        # Train synchronously on all available GPUs
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(
            config, trainer)  # training ultimately uses get_optimizer and build_graph
Example no. 16
0
                        type=str, default='NCHW')
    parser.add_argument('-d', '--depth', help='resnet depth',
                        type=int, default=18, choices=[18, 34, 50, 101, 152])
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--batch', default=256, type=int,
                        help='total batch size. 32 per GPU gives best accuracy, higher values should be similarly good')
    parser.add_argument('--mode', choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use', default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.data_format, args.mode)
    if args.eval:
        batch = 128    # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log', 'imagenet-{}-d{}'.format(args.mode, args.depth)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example no. 17
0
                        train_config=session_config,
                        load_model=args.load)
    #print(tf.test.is_gpu_available())
    #print(get_available_gpus())

    print("Net configured")

    if args.load:
        print(">>>> Loading stored model parameters.")
        # example args.load '/path/to/model/folder/model-xxxx'
        config.session_init = SaverRestore(args.load)

    if args.tot == 'train':
        if args.mp == 0:
            print("using simple trainer")
            launch_train_with_config(config, SimpleTrainer())
        else:
            print(
                "can use simple (mp=0) trainer multi gpu parameter server or replicated"
            )
            print(
                "for nccl as well as multiprocess distributed (mp=2) or multithreaded distributed (mp=else)"
            )
            if args.nccl == 0:
                print(">>>> Using " + str(args.num_gpu) +
                      " available GPU parameter server.")
                launch_train_with_config(config,
                                         SyncMultiGPUTrainer(args.num_gpu))
            elif args.num_gpu and args.nccl != 0:
                print(">>>> Using " + str(args.num_gpu) +
                      " available GPU for replicated training (nccl).")
Example no. 18
0
        "Note that it's best to keep per-GPU batch size in [32, 64] to obtain the best accuracy."
        "Pretrained models listed in README were trained with batch=32x8.")
    parser.add_argument('--mode',
                        choices=['resnet', 'preact', 'se'],
                        help='variants of resnet to use',
                        default='resnet')
    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    model = Model(args.depth, args.mode)
    model.data_format = args.data_format
    if args.eval:
        batch = 128  # something that can run on one gpu
        ds = get_data('val', batch)
        eval_on_ILSVRC12(model, get_model_loader(args.load), ds)
    else:
        if args.fake:
            logger.set_logger_dir(os.path.join('train_log', 'tmp'), 'd')
        else:
            logger.set_logger_dir(
                os.path.join('train_log',
                             'imagenet-{}-d{}'.format(args.mode, args.depth)))

        config = get_config(model, fake=args.fake)
        if args.load:
            config.session_init = get_model_loader(args.load)
        trainer = SyncMultiGPUTrainerReplicated(max(get_nr_gpu(), 1))
        launch_train_with_config(config, trainer)
Example no. 19
0
                        help='systolic array width',
                        type=int,
                        default=256)
    parser.add_argument('--accumulator-array-height',
                        help='accumulator array height',
                        type=int,
                        default=4096)
    parser.add_argument('--tensorpack-logdir-id',
                        help='TensorPack training log directory id',
                        type=str,
                        default='')
    parser.add_argument('--mpusim-logdir',
                        help='MPU simulator log directory',
                        type=str,
                        default='.')
    args = parser.parse_args()

    model = Model(args.resnet_depth, args.activations_datatype_size_byte,
                  args.weights_datatype_size_byte,
                  args.results_datatype_size_byte, args.systolic_array_height,
                  args.systolic_array_width, args.accumulator_array_height,
                  args.mpusim_logdir)

    logger.set_logger_dir(
        os.path.join(
            'train_log', 'resnext_{}{}'.format(args.resnet_depth,
                                               args.tensorpack_logdir_id)))

    config = get_config(model)
    launch_train_with_config(config, SimpleTrainer())