def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'], trainer_params['num_parallel'])

    # Persist the resolved configuration next to the checkpoints.
    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            # Keep the checkpoint with the lowest validation loss.
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
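# Hedged usage sketch (the dataset name, path, and loader below are
# illustrative assumptions; `params` must behave like the dict-style
# config object used above and expose .to_file()):
#
#   params = Params.from_file('configs/vqvae.json')   # hypothetical loader
#   train_vqvae(params, dataset='toy', checkpoint_dir='train_log/vqvae')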
def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(
        nr_tower, batch))

    callbacks = [ThroughputTracker(args.batch)]
    if args.fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000,
            random=False, dtype='uint8'))
    else:
        data = QueueInput(
            get_imagenet_dataflow(args.data, 'train', batch),
            # use a larger queue
            queue=tf.FIFOQueue(200, [tf.uint8, tf.int32],
                               [[batch, 224, 224, 3], [batch]])
        )

    BASE_LR = 30
    SCALED_LR = BASE_LR * (args.batch / 256.0)
    callbacks.extend([
        ModelSaver(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate', [
                (0, SCALED_LR),
                (60, SCALED_LR * 1e-1),
                (70, SCALED_LR * 1e-2),
                (80, SCALED_LR * 1e-3),
                (90, SCALED_LR * 1e-4),
            ]),
    ])

    dataset_val = get_imagenet_dataflow(args.data, 'val', 64)
    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    if nr_tower == 1:
        callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
    else:
        callbacks.append(DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))))

    if args.load.endswith(".npz"):
        # a released model in npz format
        init = SmartInit(args.load)
    else:
        # a pre-trained checkpoint
        init = SaverRestore(args.load, ignore=("learning_rate", "global_step"))

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        session_init=init,
        max_epoch=100,
    )
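# The schedule above follows the linear scaling rule: the base LR is
# multiplied by total_batch / 256, then decayed 10x at fixed epochs.
# A self-contained sketch of the same arithmetic (the helper name is an
# assumption, not part of the training code above):
def scaled_lr_schedule(total_batch, base_lr=30.0,
                       boundaries=(0, 60, 70, 80, 90)):
    """Return (epoch, lr) pairs for ScheduledHyperParamSetter."""
    scaled = base_lr * (total_batch / 256.0)
    return [(epoch, scaled * 10 ** -i) for i, epoch in enumerate(boundaries)]

# scaled_lr_schedule(512) -> [(0, 60.0), (60, 6.0), (70, 0.6), ...]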
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False,
            dataset_params['train_val_split'],
            trainer_params['batch_size'], trainer_params['num_parallel'])

    # The prior is trained on top of a pre-trained VQ-VAE; its config is
    # read from the directory that holds the VQ-VAE checkpoint.
    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(
        os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds, infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def _inference_runner_train_cbs(args, ds_val, insrc_val, val_cbs):
    train_cbs = []
    if args.do_remote_child_inf_runner:
        if args.num_classes > 1:
            val_cbs.extend(
                generate_classification_callbacks(args.net_info.master))
        else:
            val_cbs.extend(
                generate_regression_callbacks(args.net_info.master))
        inf_runner = InferenceRunner(ds_val or insrc_val,
                                     [ScalarStats('cost')] + val_cbs)
        train_cbs.append(inf_runner)
    return train_cbs
def train(args, logdir):
    # model
    model = Net1()

    preprocessing(data_path)
    preprocessing(test_path)

    # dataflow
    df = Net1DataFlow(data_path, hp.train1.batch_size)
    df_test = Net1DataFlow(test_path, hp.train1.batch_size)
    # datas = df.get_data()
    # print(datas[1])

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    # session_conf = tf.ConfigProto(
    #     gpu_options=tf.GPUOptions(
    #         allow_growth=True,
    #     ),)

    # cf. https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py
    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=hp.train1.batch_size * 10, n_thread=1)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(
                df_test(n_prefetch=1),
                ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = ('{}/{}'.format(logdir, args.ckpt) if args.ckpt
            else tf.train.latest_checkpoint(logdir))
    num_gpu = hp.train1.num_gpu
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        num_gpu = len(args.gpu.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def train_image_embedding_softmax(params, checkpoint_dir, recover=True,
                                  force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False, dataset_params['train_val_split'],
        trainer_params['batch_size'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(
        os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction',
                                    'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status',
                     ['loss', 'accuracy',
                      'validation_loss', 'val-correct_prediction'],
                     after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, _, _ = \
        get_dataflow(dataset_params['path'],
                     dataset_params['binarizer'],
                     dataset_params['train_val_split'],
                     trainer_params['batch_size'],
                     trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_params['latent_dim']
    model = BaseVAE.from_params(model_params)
    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
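# For reference (standard VAE identities, not project-specific code):
# the two monitored stats relate as
#   neg_elbo = -(avg_logpx_z - KL(q(z|x) || p(z)))
# so MinSaver on 'validation_neg_elbo' keeps the highest-ELBO checkpoint.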
def ptb_training_cbs(model, args, ptb_data_dir, train_cbs):
    # compute some callbacks for training
    # shift_state_callback_train = PerStepHookWithControlDependencies(
    #     op_func=lambda: model.update_state(),
    #     dependencies_func=lambda self: [self.trainer.train_op]
    # )
    # train_cbs.append(shift_state_callback_train)
    train_cbs.append(RunOp(lambda: model.reset_state()))

    if args.training_type in ['tensorpack', 'petridish']:
        train_cbs.append(
            HyperParamSetterWithFunc(
                'learning_rate',
                lambda e, x: x * 0.80 if e > 6 else x))

    if args.training_type in ['tensorpack', 'petridish', 'darts_final']:
        # TODO keep these for now for debugging; remove for search
        l_splits = ['valid', 'test']
        for split in l_splits:
            data = PennTreeBankDataFlow(
                split, ptb_data_dir, args.batch_size,
                args.model_rnn_max_len, var_size=False)
            # shift_state_inf = PerStepInferencer(
            #     op_func=lambda: model.inference_update_tensor(name_only=True))
            inferencer = InferenceRunner(
                data,
                [
                    ScalarStats(['avg_batch_cost', 'seq_len'], prefix=split),
                    # shift_state_inf
                ],
                tower_name='InferenceTower_{}'.format(split))
            reset_state_cb = RunOp(lambda: model.reset_state())
            train_cbs.extend([inferencer, reset_state_cb])

        print_cb = CallbackFactory(trigger=lambda self: [
            self.trainer.monitors.put_scalar(
                '{}_perplexity'.format(split),
                np.exp(self.trainer.monitors.get_latest(
                    '{}_avg_batch_cost'.format(split)) /
                    self.trainer.monitors.get_latest(
                        '{}_seq_len'.format(split))))
            for split in l_splits
        ])
        train_cbs.append(print_cb)
    return train_cbs
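# The CallbackFactory above publishes perplexity = exp(avg_batch_cost /
# seq_len). A minimal standalone sketch of that arithmetic (the function
# name is an assumption):
import numpy as np

def per_token_perplexity(avg_batch_cost, seq_len):
    """Perplexity from a sequence cost normalized by its length."""
    return np.exp(avg_batch_cost / seq_len)

# per_token_perplexity(230.0, 50) ~= 99.5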
def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    TIMIT_TRAIN_WAV = 'TIMIT/TRAIN/*/*/*.npz'
    TIMIT_TEST_WAV = 'TIMIT/TEST/*/*/*.npz'
    train_glob = os.path.join(hp.train1.preproc_data_path, args.case,
                              TIMIT_TRAIN_WAV)
    test_glob = os.path.join(hp.train1.preproc_data_path, args.case,
                             TIMIT_TEST_WAV)
    print(train_glob)
    print(test_glob)
    df = Net1DataFlow(train_glob, hp.train1.batch_size)
    df_test = Net1DataFlow(test_glob, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(
                df_test(n_prefetch=1),
                ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = ('{}/{}'.format(logdir, args.ckpt) if args.ckpt
            else tf.train.latest_checkpoint(logdir))
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if hp.default.use_gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = hp.default.gpu_list
        train_conf.nr_tower = len(hp.default.gpu_list.split(','))
        num_gpu = len(hp.default.gpu_list.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def _default_callbacks(self):
    self.callbacks = [
        ModelSaver(max_to_keep=self.args.max_to_keep),
        EstimatedTimeLeft(),
    ]
    if self.args.gpu and self.args.gpu != "-1":
        self.callbacks.append(GPUUtilizationTracker())
    if self.args.validation is not None:
        self.callbacks.append(
            InferenceRunner(self.dataflow(True),
                            [ScalarStats(self.total_cost_var)]))
    # Track the minimum of the validation cost when a validation set is
    # configured, otherwise the training cost.
    self.callbacks.append(
        MinSaver(self.validation_total_cost_var
                 if self.args.validation is not None
                 else self.total_cost_var))
    self._network_specific_callbacks()
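# Hedged sketch of how _default_callbacks is meant to be extended; the
# subclass name, base class, and schedule below are assumptions for
# illustration only:
#
#   class MyTrainer(BaseTrainerWrapper):        # hypothetical base class
#       def _network_specific_callbacks(self):
#           self.callbacks.append(
#               ScheduledHyperParamSetter('learning_rate',
#                                         [(0, 1e-3), (30, 1e-4)]))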
# This snippet assumes the usual setup from tensorpack's Keras example:
# `M = tf.keras.models.Sequential()`, `KL = tf.keras.layers`, and
# `regularizers = tf.keras.regularizers`.
M.add(KL.Conv2D(32, 3, activation='relu', padding='same'))
M.add(KL.MaxPooling2D())
M.add(KL.Conv2D(32, 3, padding='same', activation='relu'))
M.add(KL.Flatten())
M.add(KL.Dense(512, activation='relu',
               kernel_regularizer=regularizers.l2(1e-5)))
M.add(KL.Dropout(0.5))
M.add(KL.Dense(10, activation=None,
               kernel_regularizer=regularizers.l2(1e-5)))
M.add(KL.Activation('softmax'))

trainer = SimpleTrainer()
setup_keras_trainer(
    trainer, model=M,
    input=QueueInput(dataset_train),
    optimizer=tf.train.AdamOptimizer(1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy'])
trainer.train_with_defaults(
    callbacks=[
        ModelSaver(),
        InferenceRunner(dataset_test,
                        [ScalarStats(['total_loss', 'accuracy'])]),
    ],
    steps_per_epoch=dataset_train.size(),
)
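# Hedged sketch of how dataset_train / dataset_test could be produced
# with tensorpack dataflows (modeled on tensorpack's mnist-keras example;
# the one-hot mapping matches the categorical_crossentropy loss above):
import numpy as np
from tensorpack.dataflow import BatchData, MapData, dataset

def get_mnist_data(train_or_test, batch=128):
    ds = dataset.Mnist(train_or_test)
    ds = BatchData(ds, batch, remainder=(train_or_test == 'test'))
    # Map integer labels to one-hot vectors for the softmax output.
    return MapData(ds, lambda dp: [dp[0], np.eye(10)[dp[1]]])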
def critic_train(ctrl, data, log_dir, model_dir, prev_dir, vs_name,
                 split_train_val=False):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Decay the critic learning rate by 0.9 every epoch.
    lr_schedule = []
    max_epoch = ctrl.critic_train_epoch
    lr = ctrl.critic_init_lr
    for epoch in range(max_epoch):
        lr_schedule.append((epoch + 1, lr))
        lr *= 0.9

    # Shuffle once, then take a 90/10 train/val split; when the split is
    # disabled, train on everything and validate on the last 10%.
    ds_size = len(data[0])
    idxs = list(range(ds_size))
    np.random.shuffle(idxs)
    if split_train_val:
        train_size = ds_size * 9 // 10
        if train_size == 0:
            train_size = ds_size
        val_start = train_size
    else:
        train_size = ds_size
        val_start = ds_size * 9 // 10
        if ds_size - val_start == 0:
            val_start = 0
    data_train = [[col[k] for k in idxs[:train_size]] for col in data]
    data_val = [[col[k] for k in idxs[val_start:]] for col in data]

    model = critic_factory(ctrl, is_train=True, vs_name=vs_name)
    ds_train = critic_dataflow_factory(ctrl, data_train, is_train=True)
    ds_val = critic_dataflow_factory(ctrl, data_val, is_train=False)

    session_config = None
    device = 0
    if ctrl.critic_type == CriticTypes.LSTM:
        # The LSTM critic runs on CPU only.
        session_config = tf.ConfigProto(device_count={'GPU': 0})
        device = -1

    extra_callbacks = DEFAULT_CALLBACKS()
    extra_callbacks = list(
        filter(lambda x: not isinstance(x, ProgressBar), extra_callbacks))
    logger.info("Extra callbacks are {}".format(
        list(map(lambda x: x.__class__, extra_callbacks))))

    # Put this into callbacks for in-training validation/inferencing
    inference_callback = InferenceRunner(
        ds_val, [ScalarStats('{}/cost'.format(vs_name))], device=device)

    config = TrainConfig(
        dataflow=ds_train,
        callbacks=[
            ModelSaver(checkpoint_dir=model_dir, max_to_keep=1,
                       keep_checkpoint_every_n_hours=100),
            ScheduledHyperParamSetter('learning_rate', lr_schedule)
        ],
        extra_callbacks=extra_callbacks,
        model=model,
        monitors=[JSONWriter(), ScalarPrinter()],  # , TFEventWriter()],
        steps_per_epoch=ds_train.size(),
        max_epoch=max_epoch,
        session_config=session_config)
    ckpt = tf.train.latest_checkpoint(prev_dir if prev_dir else model_dir)
    if ckpt:
        config.session_init = SaverRestore(ckpt)
    launch_train_with_config(config, SimpleTrainer())
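# The schedule loop above is equivalent to this closed form (a sketch;
# the helper name is an assumption):
def critic_lr_schedule(init_lr, max_epoch, decay=0.9):
    """Return (epoch, lr) pairs with a fixed per-epoch decay."""
    return [(epoch + 1, init_lr * decay ** epoch)
            for epoch in range(max_epoch)]

# critic_lr_schedule(0.1, 3) -> [(1, 0.1), (2, 0.09), (3, 0.081)]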