def main(cfg):
    print(cfg)
    tf.reset_default_graph()
    logger.set_logger_dir('tflogs', action='d')
    copyfile(hydra.utils.to_absolute_path('model.py'), 'model.py')
    copyfile(hydra.utils.to_absolute_path('dataflow.py'), 'dataflow.py')

    if cfg.cat_name == 'smpl':
        train_df = SMPLDataFlow(cfg, True, 1000)
        val_df = VisSMPLDataFlow(cfg, True, 1000, port=1080)
    else:
        train_df = ShapeNetDataFlow(cfg, cfg.data.train_txt, True)
        val_df = VisDataFlow(cfg, cfg.data.val_txt, False, port=1080)

    config = TrainConfig(
        model=Model(cfg),
        dataflow=BatchData(PrefetchData(train_df, cpu_count() // 2, cpu_count() // 2), cfg.batch_size),
        callbacks=[
            ModelSaver(),
            SimpleMovingAverage(['recon_loss', 'GAN/loss_d', 'GAN/loss_g', 'GAN/gp_loss', 'symmetry_loss'], 100),
            PeriodicTrigger(val_df, every_k_steps=30)
        ],
        monitors=tensorpack.train.DEFAULT_MONITORS() + [ScalarPrinter(enable_step=True, enable_epoch=False)],
        max_epoch=10
    )
    launch_train_with_config(config, SimpleTrainer())
def train_vqvae(params, dataset, checkpoint_dir, recover=True):
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']
    image_shape = model_params['image_shape']

    train_ds, val_ds, sample_train, sample_test = load_toy_dataset(
        dataset, trainer_params['batch_size'], trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseVQVAE.from_params(model_params)

    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Reconstruct(model, sample_train, sample_test,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['loss', 'perplexity'])),
            MaxSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip'))
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, SimpleTrainer())
def main():
    args = parse_args()
    args.seed = init_rand(seed=args.seed)

    _, log_file_exist = initialize_logging(
        logging_dir_path=args.save_dir,
        logging_file_name=args.logging_file_name,
        script_args=args,
        log_packages=args.log_packages,
        log_pip_packages=args.log_pip_packages)
    logger.set_logger_dir(args.save_dir)

    batch_size = prepare_tf_context(num_gpus=args.num_gpus, batch_size=args.batch_size)

    classes = 1000
    net, inputs_desc = prepare_model(
        model_name=args.model,
        classes=classes,
        use_pretrained=args.use_pretrained,
        pretrained_model_file_path=args.resume.strip())

    train_dataflow = get_data(is_train=True, batch_size=batch_size, data_dir_path=args.data_dir)
    val_dataflow = get_data(is_train=False, batch_size=batch_size, data_dir_path=args.data_dir)

    train_net(
        net=net,
        session_init=inputs_desc,
        batch_size=batch_size,
        num_epochs=args.num_epochs,
        train_dataflow=train_dataflow,
        val_dataflow=val_dataflow)
def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    df = Net1DataFlow(hp.train1.data_path, hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),
    )

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )

    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
def train(args, logdir1, logdir2):
    # model
    model = Net2()

    preprocessing(data_path, logdir2)

    # dataflow
    df = Net2DataFlow(data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    # session_conf = tf.ConfigProto(
    #     gpu_options=tf.GPUOptions(
    #         allow_growth=True,
    #         per_process_gpu_memory_fraction=0.6,
    #     ),
    # )

    dataset_size = len(glob.glob(data_path + '/wav/*.wav'))
    print("\t\tdata_path : ", data_path)
    print("\t\tDataset Size : ", dataset_size)
    print("\t\tBatch Size : ", hp.train2.batch_size)
    print("\t\tSteps per epoch : ", (dataset_size // hp.train2.batch_size))
    from time import sleep
    sleep(10)

    session_inits = []
    ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=dataset_size // hp.train2.batch_size,
        session_init=ChainInit(session_inits))

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        gpu_list = args.gpu.split(',')
        gpu_list = list(map(int, gpu_list))

    # trainer = SimpleTrainer()
    trainer = SyncMultiGPUTrainerReplicated(gpu_list)
    # trainer = AsyncMultiGPUTrainer(gpu_list, False)
    launch_train_with_config(train_conf, trainer=trainer)
def _setup_logging(logdir, is_horovod):
    # Setup logging ...
    if is_horovod:
        hvd.init()
    if not is_horovod or hvd.rank() == 0:
        logger.set_logger_dir(logdir, 'd')
        logger.info("Environment Information:\n" + collect_env_info())
def get_avatar_synth_args():
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_dir', help='Directory of train data',
                        default='./data/bitmoji/train')
    parser.add_argument('--test_dir', help='Directory of test data',
                        default='./data/bitmoji/test')
    parser.add_argument('--logger_dir',
                        help='Directory to save logs and model checkpoints',
                        default=os.path.join('save', 'log', date_str()))
    parser.add_argument('--load_path', help='Path of the model checkpoint to load')
    parser.add_argument('--epochs', help='Number of epochs to train',
                        default=100000, type=int)
    parser.add_argument('--batch_size', help='Minibatch size', default=512, type=int)
    parser.add_argument('--lr', help='Learning rate', default=1e-4, type=float)
    parser.add_argument('--lr_decay',
                        help='The multiple by which to decay the learning rate every epoch',
                        default=0.96, type=float)
    parser.add_argument('--resume_lr',
                        help='Resume the learning rate from the previous run',
                        action='store_true')
    parser.add_argument('--keep_prob',
                        help='The keep probability for dropout (always 1 for testing)',
                        default=0.5, type=float)
    parser.add_argument('--summary_freq',
                        help='Frequency (in steps) with which to write tensorboard summaries',
                        default=100, type=int)
    parser.add_argument('--gpu', help='Comma separated list of GPU(s) to use',
                        default='0')
    parser.add_argument('--num_threads',
                        help='The number of threads to read and process data',
                        default=32, type=int)

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    set_logger_dir(args.logger_dir)

    return args
def train(args, logdir):
    # model
    model = Net1()

    preprocessing(data_path)
    preprocessing(test_path)

    # dataflow
    df = Net1DataFlow(data_path, hp.train1.batch_size)
    df_test = Net1DataFlow(test_path, hp.train1.batch_size)
    # datas = df.get_data()
    # print(datas[1])

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    # session_conf = tf.ConfigProto(
    #     gpu_options=tf.GPUOptions(
    #         allow_growth=True,
    #     ),)

    # cv test code
    # https://github.com/tensorpack/tensorpack/blob/master/examples/boilerplate.py
    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=hp.train1.batch_size * 10, n_thread=1)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(
                df_test(n_prefetch=1),
                ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)

    num_gpu = hp.train1.num_gpu
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))
        num_gpu = len(args.gpu.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def train_pixelcnn_prior(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, sample_train_label, \
        sample_val_label = get_dataflow(
            dataset_params['path'], False,
            dataset_params['train_val_split'], trainer_params['batch_size'],
            trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    latent_shape = model_params['latent_shape']
    num_labels = model_params['num_labels']

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BasePixelCNNPrior.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            SequentialSampling(trainer_params['num_examples_to_generate'],
                               latent_shape, num_labels, model,
                               os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images'),
                        sample_train_label, sample_val_label),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds, infs=ScalarStats(['loss'])),
            MinSaver(monitor_stat='validation_loss'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            RestoreWeights(vqvae_checkpoint_path),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def _dataflow(self, validation: bool = False) -> DataFlow:
    assert self.step is not None
    assert isinstance(self.step, df.Dataflows)
    logger.set_logger_dir(self.args.save, action="k")
    return df.get_data(
        self.step,
        self.args.validation if self.args.validation is not None else self.args.data,
        self.args.batch_size,
        n_proc=self.args.nproc,
        n_gpus=get_num_gpu(),
    )
def train_image_embedding_softmax(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, _, _, _, _ = get_dataflow(
        dataset_params['path'], False,
        dataset_params['train_val_split'], trainer_params['batch_size'],
        trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            InferenceRunner(input=val_ds, infs=[
                ScalarStats('loss'),
                ClassificationError('correct_prediction', 'val-correct_prediction')]),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='val-correct_prediction'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', [
                'loss', 'accuracy',
                'validation_loss', 'val-correct_prediction'], after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def train(args, logdir1, logdir2):
    # model
    model = Net2()

    # dataflow
    df = Net2DataFlow(hp.train2.data_path, hp.train2.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir2)

    session_conf = tf.ConfigProto(
        # log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(
            # allow_growth=True,
            per_process_gpu_memory_fraction=0.6,
        ),
    )

    session_inits = []
    ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(session_inits),
        session_config=session_conf
    )

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    # trainer = SyncMultiGPUTrainerParameterServer(hp.train2.num_gpu)
    trainer = SimpleTrainer()
    launch_train_with_config(train_conf, trainer=trainer)
def local_crawler_main(auto_dir, nr_gpu, launch_log_dir, n_parallel=10000, num_init_use_all_gpu=2):
    """
    Args:
        auto_dir (str): dir for looking for xxx.sh to run
        nr_gpu (int): number of GPUs on the local container
        launch_log_dir (str): where the launcher logs stuff and holds tmp scripts
        n_parallel (int): maximum number of parallel jobs
        num_init_use_all_gpu (int): number of initial jobs that will use all GPUs
    """
    logger.set_logger_dir(launch_log_dir, action='d')
    launcher = os.path.basename(os.path.normpath(launch_log_dir))
    crawl_local_auto_scripts_and_launch(auto_dir, nr_gpu, launcher, n_parallel, num_init_use_all_gpu)
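A minimal sketch of how this launcher might be invoked, based only on the signature and docstring above; the directory paths and argument values are hypothetical placeholders:

# Hypothetical invocation of local_crawler_main (paths are placeholders).
# Watches ./auto_scripts for *.sh jobs, schedules them on 4 local GPUs, and
# writes launcher logs and tmp scripts under ./launch_logs/crawler0.
local_crawler_main(
    auto_dir='./auto_scripts',
    nr_gpu=4,
    launch_log_dir='./launch_logs/crawler0',
    n_parallel=8,
    num_init_use_all_gpu=2,
)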
def train(args, logdir):
    # model
    print("####model")
    model = Net1()

    # dataflow
    print("####dataflow")
    df = Net1DataFlow(hp.Train1.data_path, hp.Train1.batch_size)

    # set logger for event and model saver
    print("####logger")
    logger.set_logger_dir(logdir)

    print("####session_conf")
    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),
        allow_soft_placement=True)

    print("####train_conf")
    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=5)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.Train1.num_epochs,
        steps_per_epoch=hp.Train1.steps_per_epoch,
        session_config=session_conf)

    print("####ckpt")
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    print("####trainer")
    trainer = SyncMultiGPUTrainerReplicated(hp.Train1.num_gpu)

    print("####launch_train_with_config")
    launch_train_with_config(train_conf, trainer=trainer)
def main(_):
    args = parse_args()

    # set gpu/cpu mode
    if int(args.gpu_id) >= 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    checkpoint_dir = os.path.join('./checkpoints/', args.name)
    logger.set_logger_dir(checkpoint_dir)

    # set up deblur models
    M = model.DEBLUR(args)

    ds_train = get_data(args.dataroot, phase='train', crop_size=args.cropSize, batch_size=args.batchSize)
    ds_val = get_data(args.dataroot, phase='val', crop_size=args.cropSize, batch_size=args.batchSize)

    trainer = SeparateGANTrainer(ds_train, M, g_period=6)
    trainer.train_with_defaults(
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            ScheduledHyperParamSetter('learning_rate',
                                      [(300, args.learning_rate), (args.max_epoch, 0)],
                                      interp='linear'),
            InferenceRunner(ds_val, [
                ScalarStats('PSNR_BASE'),
                ScalarStats('PSNR_2'),
                ScalarStats('PSNR_IMPRO2'),
                ScalarStats('pixel_loss2'),
                ScalarStats('feature_loss2')
            ])
        ],
        session_init=SaverRestore(checkpoint_dir + '/model-431249.data-00000-of-00001')
        if args.continue_train else None,
        starting_epoch=1,
        steps_per_epoch=args.steps_per_epoch,
        max_epoch=args.max_epoch)
def train_vae(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds, val_ds, sample_train, sample_val, _, _ = \
        get_dataflow(dataset_params['path'], dataset_params['binarizer'],
                     dataset_params['train_val_split'],
                     trainer_params['batch_size'],
                     trainer_params['num_parallel'])

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    latent_dim = model_params['latent_dim']
    model = BaseVAE.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            Sampling(model, trainer_params['num_examples_to_generate'],
                     latent_dim, os.path.join(checkpoint_dir, 'images')),
            Reconstruct(model, sample_train, sample_val,
                        os.path.join(checkpoint_dir, 'images')),
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            InferenceRunner(input=val_ds,
                            infs=ScalarStats(['avg_logpx_z', 'neg_elbo'])),
            MinSaver(monitor_stat='validation_neg_elbo'),
            CompressResource(os.path.join(checkpoint_dir, 'images'),
                             os.path.join(checkpoint_dir, 'images.zip')),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_path', help='Path to train csv.',
                        default='./data/ljspeech-processed/train.txt')
    parser.add_argument('--test_path', help='Path to test csv.',
                        default='./data/ljspeech-processed/test.txt')
    parser.add_argument('--save_dir',
                        help='Directory to save logs and model checkpoints',
                        default=os.path.join('save', 'wavenet', date_str()))
    parser.add_argument('--load_path', help='Path of the model checkpoint to load')
    parser.add_argument('--summary_freq',
                        help='Frequency (in train steps) with which to write tensorboard summaries',
                        default=20, type=int)
    parser.add_argument('--steps_per_epoch',
                        help='Steps per epoch, defaults to the batch size',
                        default=None, type=int)
    parser.add_argument('--skip_inferencing',
                        help='Whether or not to skip inferencing after epochs',
                        action='store_true')
    parser.add_argument('--gpu', help='Which GPU to use')
    parser.add_argument('--n_threads',
                        help='The number of threads to read and process data',
                        default=2, type=int)
    parser.add_argument('--resume_lr',
                        help='Resume the learning rate from the loaded run',
                        action='store_true')

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    set_logger_dir(args.save_dir)

    return args
def train(args, logdir):
    # model
    model = Net1()

    # dataflow
    TIMIT_TRAIN_WAV = 'TIMIT/TRAIN/*/*/*.npz'
    TIMIT_TEST_WAV = 'TIMIT/TEST/*/*/*.npz'
    print(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV))
    print(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV))
    df = Net1DataFlow(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TRAIN_WAV),
                      hp.train1.batch_size)
    df_test = Net1DataFlow(os.path.join(hp.train1.preproc_data_path, args.case, TIMIT_TEST_WAV),
                           hp.train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    train_conf = AutoResumeTrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=8)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            InferenceRunner(df_test(n_prefetch=1),
                            ScalarStats(['net1/eval/loss', 'net1/eval/acc'], prefix='')),
        ],
        max_epoch=hp.train1.num_epochs,
        steps_per_epoch=hp.train1.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if hp.default.use_gpu == True:
        os.environ['CUDA_VISIBLE_DEVICES'] = hp.default.gpu_list
        train_conf.nr_tower = len(hp.default.gpu_list.split(','))
        num_gpu = len(hp.default.gpu_list.split(','))
        trainer = SyncMultiGPUTrainerReplicated(num_gpu)
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        trainer = SimpleTrainer()

    launch_train_with_config(train_conf, trainer=trainer)
def train_image_embedding_triplet(params, checkpoint_dir, recover=True, force=False):
    if force and os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    logger.set_logger_dir(checkpoint_dir)

    dataset_params = params['dataset']
    model_params = params['model']
    trainer_params = params['trainer']

    train_ds = get_triplet_dataflow(
        dataset_params['path'], trainer_params['items_per_batch'],
        trainer_params['images_per_item'], trainer_params['num_parallel'])

    vqvae_checkpoint_path = trainer_params['vqvae_checkpoint_path']
    vqvae_config_path = os.path.join(os.path.split(vqvae_checkpoint_path)[0], 'config.json')
    model_params['vqvae_model_params'] = vqvae_config_path

    params.to_file(os.path.join(logger.get_logger_dir(), 'config.json'))

    model = BaseImageEmbedding.from_params(model_params)

    trainer = SyncMultiGPUTrainerParameterServer(
        gpus=trainer_params['num_gpus'], ps_device=None)
    trainer_config = AutoResumeTrainConfig(
        always_resume=recover,
        model=model,
        dataflow=train_ds,
        callbacks=[
            ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir),
            MinSaver(monitor_stat='loss'),
            RestoreWeights(vqvae_checkpoint_path),
            SendStat('Training status', ['loss', 'pos_triplet_frac'], after_every=2),
            Notification('Training status', 'Complete')
        ],
        steps_per_epoch=trainer_params['steps_per_epoch'],
        max_epoch=trainer_params['max_epochs']
    )
    launch_train_with_config(trainer_config, trainer)
def train(args, logdir):
    # model
    model = Net()

    # dataflow
    df = NetDataFlow(hp.train.data_path, hp.train.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),)
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.45  # cap this process at 45% of GPU memory

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
        # session_config=session_conf
    )
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)
    launch_train_with_config(train_conf, trainer=trainer)
temp = temp[keys[i]]
temp[keys[-1]] = value

# set GPU machine
if config['gpu'] in [None, 'None', '']:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    num_gpu = 0
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = config['gpu']
    num_gpu = max(get_num_gpu(), 1)
config['num_gpu'] = num_gpu

# set log directory
if config['logdir'] in [None, 'None', '']:
    logger.auto_set_dir()
else:
    logger.set_logger_dir('train_log/' + config['logdir'], action='d')

# save configuration
with open(logger.get_logger_dir() + '/config.json', 'w') as outfile:
    json.dump(config, outfile)

# get train config
train_config = get_train_config(config)

# train the model
if num_gpu > 1:
    launch_train_with_config(train_config, SyncMultiGPUTrainerReplicated(num_gpu))
else:
    launch_train_with_config(train_config, SimpleTrainer())
    # manually build the graph with batch=1
    input_desc = [
        InputDesc(tf.float32, [1, 224, 224, 3], 'input'),
        InputDesc(tf.int32, [1], 'label')
    ]
    input = PlaceholderInput()
    input.setup(input_desc)
    with TowerContext('', is_training=False):
        model.build_graph(*input.get_input_tensors())
    model_utils.describe_trainable_vars()

    tf.profiler.profile(
        tf.get_default_graph(),
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.float_operation())
    logger.info("Note that TensorFlow counts flops in a different way from the paper.")
    logger.info("TensorFlow counts multiply+add as two flops, however the paper counts them "
                "as 1 flop because it can be executed in one instruction.")
else:
    if args.v2:
        name = "ShuffleNetV2-{}x".format(args.ratio)
    else:
        name = "ShuffleNetV1-{}x-g{}".format(args.ratio, args.group)
    logger.set_logger_dir(os.path.join('train_log', name))

    nr_tower = max(get_num_gpu(), 1)
    config = get_config(model, nr_tower)
    if args.load:
        config.session_init = get_model_loader(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=2.0, mode='fan_in'),
              kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)
    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model', action='store_true')
    args = parser.parse_args()

    logger.set_logger_dir(os.path.join("train_log", "imagenet-resnet-keras"))

    tf.keras.backend.set_image_data_format('channels_first')

    num_gpu = get_num_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // num_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(args.data, 'train', batch_size, fbresnet_augmentor(True))
    x = Flatten()(x)
    x = Dense(1000, activation='softmax', name='fc1000',
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=2.0, mode='fan_in'),
              kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)
    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model', action='store_true')
    args = parser.parse_args()

    logger.set_logger_dir(os.path.join("train_log", "imagenet-resnet-keras"))

    tf.keras.backend.set_image_data_format('channels_first')

    nr_gpu = get_nr_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // nr_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(
            args.data, 'train', batch_size, fbresnet_augmentor(True))
        df_val = get_imagenet_dataflow(
            args.data, 'val', batch_size, fbresnet_augmentor(False))
    x = Flatten()(x)
    x = Dense(1000, activation='softmax', name='fc1000',
              kernel_initializer=tf.keras.initializers.VarianceScaling(
                  scale=2.0, mode='fan_in'),
              kernel_regularizer=tf.keras.regularizers.l2(5e-5))(x)
    M = tf.keras.models.Model(input, x, name='resnet50')
    return M


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='ILSVRC dataset dir')
    parser.add_argument('--fake', help='use fakedata to test or benchmark this model', action='store_true')
    args = parser.parse_args()

    logger.set_logger_dir("train_log/imagenet-resnet-keras")

    tf.keras.backend.set_image_data_format('channels_first')

    nr_gpu = get_nr_gpu()
    if args.fake:
        df_train = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False, dtype='uint8')
        df_val = FakeData([[64, 224, 224, 3], [64, 1000]], 5000, random=False)
    else:
        batch_size = TOTAL_BATCH_SIZE // nr_gpu
        assert args.data is not None
        df_train = get_imagenet_dataflow(
            args.data, 'train', batch_size, fbresnet_augmentor(True))
        df_val = get_imagenet_dataflow(
            args.data, 'val', batch_size, fbresnet_augmentor(False))
def get_config(nr_tower, args):
    TOTAL_BATCH_SIZE = args.batch_size
    batchsize = TOTAL_BATCH_SIZE // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batchsize))

    max_epoch = args.num_epochs
    lr = args.initial_learning_rate
    num_epochs_before_decay = args.num_epochs_before_decay
    decay_factor = args.decay_factor
    num_decay = int(max_epoch / num_epochs_before_decay)

    if args.dataset_mode == 'train_fine':
        dataset_size = 2975
    elif args.dataset_mode == 'validation_fine':
        dataset_size = 500
    elif args.dataset_mode == 'train_patches':
        dataset_size = 2975 * 8  # 23800
    elif args.dataset_mode == 'validation_patches':
        dataset_size = 500 * 8
    elif args.dataset_mode == 'train_coarse':
        dataset_size = 14440
    elif args.dataset_mode == 'combine_patches':
        dataset_size = 23800 + 3000
    elif args.dataset_mode == 'combine_val_patches':
        dataset_size = 1000

    steps_per_epoch = int(dataset_size / TOTAL_BATCH_SIZE)
    max_iter = max_epoch * steps_per_epoch

    schedule = []
    if args.lr_type == 'poly':
        end_lr = 2e-5
        for i in range(max_epoch):
            ep = i
            val = (lr - end_lr) * np.power((1 - 1. * i / num_epochs_before_decay), 0.9) + end_lr
            schedule.append((ep, val))
    if args.lr_type == 'exponential_decay':
        for i in range(num_decay):
            ep = i * num_epochs_before_decay
            val = lr * np.power(decay_factor, i)
            schedule.append((ep, val))

    model = ShuffleTensorpack(args, sub_rate=args.sub_rate, batchsize=batchsize)
    dataset_train = get_city_dataflow(args.dataset_mode, batchsize, args.sub_rate,
                                      is_train=True, random_crop=args.random_crop)

    logger.set_logger_dir(os.path.join('log', args.exp_name + '_' + str(datetime.date.today())))
    checkpoint_dir = os.path.join('log', args.exp_name + '_' + str(datetime.date.today()), 'save')

    infs = [ScalarStats(names='mean_iou', prefix='val')]  # val_mean_IoU

    callbacks = [
        PeriodicTrigger(ModelSaver(max_to_keep=5, checkpoint_dir=checkpoint_dir), every_k_steps=250),
        ScheduledHyperParamSetter('learning_rate', schedule=schedule),
        EstimatedTimeLeft(),
        MergeAllSummaries(period=250),
    ]
    if args.save_val_max is True:
        dataset_val = get_city_dataflow(args.dataset_val_mode, TOTAL_BATCH_SIZE, args.sub_rate,
                                        is_train=False, random_crop=args.random_crop)
        callbacks.extend([
            PeriodicTrigger(DataParallelInferenceRunner(dataset_val, infs, [0, 1, 2, 3]), every_k_steps=250),
            PeriodicTrigger(MaxSaver(monitor_stat='val_mean_iou', checkpoint_dir=checkpoint_dir), every_k_steps=250)
        ])

    return AutoResumeTrainConfig(model=model,
                                 dataflow=dataset_train,
                                 callbacks=callbacks,
                                 steps_per_epoch=steps_per_epoch,
                                 max_epoch=max_epoch)
def mvsnet_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='path to save model ckpt', default='.')
    parser.add_argument('--data', help='path to dataset', required=True)
    parser.add_argument('--load', help='load a model for training or evaluation')
    parser.add_argument('--exp_name', help='model ckpt name')
    parser.add_argument('--gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('--mode', '-m', help='train / val / test', choices=['train', 'val', 'test', 'fake'])
    parser.add_argument('--out', default='./',
                        help='output path for evaluation and test, default to current folder')
    parser.add_argument('--batch', default=1, type=int, help="Batch size per tower.")
    parser.add_argument('--max_d', help='depth num for MVSNet', required=True, type=int)
    parser.add_argument('--max_h', help='max height for MVSNet', required=True, type=int)
    parser.add_argument('--max_w', help='max width for MVSNet', required=True, type=int)
    parser.add_argument('--interval_scale', required=True, type=float)
    parser.add_argument('--view_num', required=True, type=int)
    parser.add_argument('--refine', default=False)
    parser.add_argument('--feature', help='feature extraction branch', choices=['uninet', 'unet'], default='unet')
    parser.add_argument('--threshold', type=float)
    parser.add_argument('--regularize', default='3DCNN', choices=['3DCNN', 'GRU'])
    args = parser.parse_args()

    if args.feature == 'unet':
        feature_branch_function = unet_feature_extraction_branch
    else:
        feature_branch_function = uni_feature_extraction_branch

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    if args.mode == 'train' or args.mode == 'fake':
        model = MVSNet(depth_num=args.max_d, bn_training=None, bn_trainable=None, batch_size=args.batch,
                       branch_function=feature_branch_function, is_refine=args.refine,
                       height=args.max_h, width=args.max_w, view_num=args.view_num,
                       regularize_type=args.regularize)
        if args.exp_name is None:
            if not args.refine:
                exp_name = '{}-{}-b{}-{}-{}-{}-no-refine'.format(
                    args.max_d, args.interval_scale, args.batch, os.path.basename(args.data),
                    args.feature, datetime.datetime.now().strftime("%m%d-%H%M"))
            else:
                exp_name = '{}-{}-b{}-{}-{}-{}-refine'.format(
                    args.max_d, args.interval_scale, args.batch, os.path.basename(args.data),
                    args.feature, datetime.datetime.now().strftime("%m%d-%H%M"))
        else:
            exp_name = args.exp_name

        logger.set_logger_dir(os.path.join(args.logdir, exp_name))
        config = get_train_conf(model, args)
        if args.load:
            config.session_init = get_model_loader(args.load)

        gpus_id = args.gpu.split(',')
        gpus = len(gpus_id)
        logger.info('num of gpus to use: {}'.format(gpus))
        if gpus > 1:
            trainer = SyncMultiGPUTrainerParameterServer(gpus)
            # trainer = SyncMultiGPUTrainerReplicated(gpus, mode='cpu')
        else:
            trainer = SimpleTrainer()
        launch_train_with_config(config, trainer)

    elif args.mode == 'val':
        assert args.load, 'in eval mode, you have to specify a trained model'
        assert args.out, 'in eval mode, you have to specify the output dir path'
        logger.set_logger_dir(args.out)
        model = MVSNet(depth_num=args.max_d, bn_training=None, bn_trainable=None, batch_size=args.batch,
                       branch_function=feature_branch_function, is_refine=args.refine,
                       height=args.max_h, width=args.max_w, view_num=args.view_num)
        sess_init = get_model_loader(args.load)
        avg_loss, avg_less_three_acc, avg_less_one_acc = evaluate(model, sess_init, args)
        logger.info(f'val loss: {avg_loss}')
        logger.info(f'val less three acc: {avg_less_three_acc}')
        logger.info(f'val less one acc: {avg_less_one_acc}')

    else:  # test
assert args.load, 'in eval mode, you have to specify a trained model'
assert args.out, 'in eval mode, you have to specify the output dir path'
assert args.data, 'in eval mode, you have to specify the data dir path'
logger.set_logger_dir(args.out)
model = MVSNet(depth_num=args.max_d, bn_training=None, bn_trainable=None, batch_size=args.batch,
               branch_function=feature_branch_function, is_refine=args.refine,
               height=args.max_h, width=args.max_w, view_num=args.view_num,
               regularize_type=args.regularize)
sess_init = get_model_loader(args.load)
test(model, sess_init, args)
        predictions = inference(pred, x_test, tta=False, mode='test')
        submit(predictions, fnames)
    else:
        train_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'train.csv'))
        num_training = len(train_df)
        if config.EXTRA:
            extra_df = pd.read_csv(
                os.path.join('/data/kaggle/HPA', 'HPAv18RGBY_WithoutUncertain_wodpl.csv'))
            num_training += len(extra_df)
        num_training = int(num_training * 0.85 * 0.8)
        print("num_training", num_training)

        logger.set_logger_dir(args.logdir)

        training_callbacks = [
            ModelSaver(max_to_keep=100, keep_checkpoint_every_n_hours=1),
            GPUUtilizationTracker(),
        ]
        # heuristic setting for baseline
        # 105678 train+extra
        stepnum = num_training // (config.BATCH * get_nr_gpu()) + 1
        max_epoch = 50

        if config.FREEZE:
            max_epoch = 4
            TRAINING_SCHEDULE = ScheduledHyperParamSetter('learning_rate', [(0, 1e-3)])
        else:
    batch = 128    # something that can run on one gpu
    ds = get_data('val', batch)
    eval_classification(model, SmartInit(args.load), ds)
elif args.flops:
    # manually build the graph with batch=1
    with TowerContext('', is_training=False):
        model.build_graph(
            tf.placeholder(tf.float32, [1, 224, 224, 3], 'input'),
            tf.placeholder(tf.int32, [1], 'label')
        )
    model_utils.describe_trainable_vars()

    tf.profiler.profile(
        tf.get_default_graph(),
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.float_operation())
    logger.info("Note that TensorFlow counts flops in a different way from the paper.")
    logger.info("TensorFlow counts multiply+add as two flops, however the paper counts them "
                "as 1 flop because it can be executed in one instruction.")
else:
    if args.v2:
        name = "ShuffleNetV2-{}x".format(args.ratio)
    else:
        name = "ShuffleNetV1-{}x-g{}".format(args.ratio, args.group)
    logger.set_logger_dir(os.path.join('train_log', name))

    nr_tower = max(get_num_gpu(), 1)
    config = get_config(model, nr_tower)
    config.session_init = SmartInit(args.load)
    launch_train_with_config(config, SyncMultiGPUTrainerParameterServer(nr_tower))
hp.set_hparam_yaml(case)
if r:
    remove_all_files(hp.logdir)

# model
model = IAFVocoder(batch_size=hp.train.batch_size, length=hp.signal.length)

# dataset
dataset = Dataset(hp.data_path, hp.train.batch_size, length=hp.signal.length)
print('dataset size is {}'.format(len(dataset.wav_files)))

# set logger for event and model saver
logger.set_logger_dir(hp.logdir)

train_conf = TrainConfig(
    model=model,
    data=TFDatasetInput(dataset()),
    callbacks=[
        ModelSaver(checkpoint_dir=hp.logdir),
        RunUpdateOps()  # for batch norm, exponential moving average
        # TODO GenerateCallback()
    ],
    max_epoch=hp.train.num_epochs,
    steps_per_epoch=hp.train.steps_per_epoch,
)
ckpt = '{}/{}'.format(hp.logdir, ckpt) if ckpt else tf.train.latest_checkpoint(hp.logdir)
if ckpt:
parser.add_argument('--num-layers', type=int, default=2)
parser.add_argument('--batch-size', type=int, default=20)
parser.add_argument('--keep-prob', type=float, default=0.5)
parser.add_argument('--init-lr', type=float, default=1.0)
# parser.add_argument('--warmup-epochs', type=int, default=6)
parser.add_argument('--epochs', type=int, default=40)
parser.add_argument('--vocab-size', type=int)
global args
args = parser.parse_args()

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

config = get_config()
config.session_init = SmartInit(args.load)

global trainer
trainer = HorovodTrainer()
if not trainer.is_chief:
    os.environ['WANDB_MODE'] = 'dryrun'
elif not args.logdir:
    logger.auto_set_dir(action="d")
else:
    logger.set_logger_dir(args.logdir, action="d")

wandb_id = os.environ.get('WANDB_ID', None)
if wandb_id is None:
    wandb.init(config=vars(args))
else:
    wandb.init(config=vars(args), id=f"{wandb_id}{trainer._rank}")
wandb.config.update({'SLURM_JOB_ID': os.environ.get('SLURM_JOB_ID', None)})
wandb.tensorboard.patch(save=False)

# reuse the HorovodTrainer created above instead of constructing a second one
launch_train_with_config(config, trainer)
def get_config(train_or_test, train_config=None, load_model=None):
    isTrain = train_or_test == 'train'

    if args.model_name is None:
        args.model_name = "no_name"
    log_dir = 'train_log/' + args.model_name
    logger.set_logger_dir(log_dir, 'n')

    dataset_train = 1
    dataset_val = None
    steps_per_epoch = 0

    # prepare dataset
    # dataflow structure [im, label] in parallel
    if isTrain:
        print(">>>>>> Loading training and validation sets")
        dataset_train = get_data('train', image_size=args.image_size,
                                 scale_size=args.scale_size, scale=args.scale,
                                 multi_crop=args.multi_crop, crop_per_case=args.crop_per_case,
                                 normalize=args.aug_norm, shuffle=True)
        # = |data| / (batch size * num gpu)
        steps_per_epoch = dataset_train.size()  # /args.num_gpu if (args.mp != 0 or args.mp != 1) else dataset_train.size()
        dataset_val = get_data('val', image_size=args.image_size,
                               scale_size=args.scale_size, scale=args.scale,
                               multi_crop=args.multi_crop, crop_per_case=args.crop_per_case,
                               normalize=args.aug_norm, shuffle=False)

    drop_rate = args.drop_out if args.drop_out is not None else 0.0

    print(" >>>>>>>>>> Steps Per Epoch: ", steps_per_epoch)
    print(">>>>>> Constructing Neural Network...")
    denseModel = Model(depth=args.depth, image_size=args.scale_size,
                       lr_init=args.lr_init, kernels=args.kernels,
                       kernel_size=args.kernel_size, expansion=args.expansion,
                       class_0=args.class_0, class_1=args.class_1,
                       drop_rate=drop_rate, drop_pattern=args.drop_pattern,
                       bn_momentum=args.bn_momentum, skip_norm=args.skip_norm,
                       train_or_test=isTrain)

    if isTrain:
        print("Setting up training configuration: callbacks, validation checks and hyperparameter scheduling.")
        return TrainConfig(
            dataflow=dataset_train,
            callbacks=[
                MovingAverageSummary(),
                ModelSaver(),  # Record state graph at intervals during epochs
                InferenceRunner(
                    input=dataset_val,
                    infs=[ScalarStats('cost'), ClassificationError()],
                ),
                MinSaver('validation_error'),  # save model with min val-error, must be after inference
                # ScheduledHyperParamSetter('learning_rate',
                #     [(args.drop_0, args.scale_lr*args.lr_0),
                #      (args.drop_1, args.scale_lr*args.lr_1)]),
                # HyperParamSetterWithFunc('learning_rate',
                #     lambda e, x: x * float(0.1) if e % 15 == 0 and e > args.drop_2 else x),  # (1+e)/(2*20)
                # ScheduledHyperParamSetter('learning_rate',
                #     [(args.drop_0, args.scale_lr*args.lr_0), (args.drop_1, args.scale_lr*args.lr_1),
                #      (args.drop_2, args.scale_lr*args.lr_2), (args.drop_3, args.scale_lr*args.lr_3)]),
                # denote current hyperparameter
                StatMonitorParamSetter('learning_rate', 'validation_error',
                                       lambda x: x * 0.1, threshold=1e-15, last_k=20),
                MergeAllSummaries()
            ],
            model=denseModel,
            session_creator=None,
            session_config=train_config,
            steps_per_epoch=steps_per_epoch,
            max_epoch=args.max_epoch,
        )
    else:
        """ Predictive model configuration for testing and classifying. """
        class TestParamSetter(Callback):
            # def _before_run(self, _):
            #     return tf.train.SessionRunArgs(
            #         fetches=[],
            #         feed_dict={'PlaceholderWithDefault_1:0': 1.0,
            #                    'PlaceholderWithDefault_2:0': False})  # 'drop_rate:0': 1, 'train_or_test:0': False
            def _setup_graph(self):
                self._drop_rate = [k for k in tf.global_variables()
                                   if k.name == 'PlaceholderWithDefault_1:0'][0]
                self._train_or_test = [k for k in tf.global_variables()
                                       if k.name == 'PlaceholderWithDefault_2:0'][0]

            def _trigger_step(self):
                self._drop_rate.load(1.0)
                self._train_or_test.load(False)

        print(">>>>>> Constructing prediction variables.")
        return PredictConfig(
            model=denseModel,
            input_names=['input', 'label'],  # denseModel._get_inputs()
            output_names=['output', 'train_error', 'cross_entropy_loss', 'input'],
        )
def get_s2b_args():
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_dir_bitmoji', help='Directory of bitmoji train data',
                        default='./data/bitmoji/train')
    parser.add_argument('--test_dir_bitmoji', help='Directory of bitmoji test data',
                        default='./data/bitmoji/test')
    parser.add_argument('--train_dir_face', help='Directory of real face train data',
                        default='./data/celeba/train')
    parser.add_argument('--test_dir_face', help='Directory of real face test data',
                        default='./data/celeba/test')
    parser.add_argument('--logger_dir',
                        help='Directory to save logs and model checkpoints',
                        default=os.path.join('save', 's2b', date_str()))
    parser.add_argument('--load_path', help='Path of the model checkpoint to load',
                        default=os.path.join('save', 's2b', 'default', 'model'))
    parser.add_argument('--epochs', help='Number of epochs to train',
                        default=100000, type=int)
    parser.add_argument('--batch_size', help='Minibatch size', default=128, type=int)
    parser.add_argument('--lr', help='Learning rate', default=1e-4, type=float)
    parser.add_argument('--decay',
                        help='The multiple by which to decay learning rate, instance noise stddev '
                             'and discriminator uncertainty threshold every epoch',
                        default=0.98, type=float)
    parser.add_argument('--resume_lr',
                        help='Resume the learning rate from the previous run',
                        action='store_true')
    parser.add_argument('--keep_prob',
                        help='The keep probability for dropout (always 1 for testing)',
                        default=0.5, type=float)
    parser.add_argument('--summary_freq',
                        help='Frequency (in steps) with which to write tensorboard summaries',
                        default=20, type=int)
    parser.add_argument('--gpu', help='Which GPU to use')
    parser.add_argument('--num_threads',
                        help='The number of threads to read and process data',
                        default=32, type=int)

    args = parser.parse_args()

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    set_logger_dir(args.logger_dir)

    return args