def train(args, logdir1, logdir2):
    """Train Net2, warm-starting from Net2 and Net1 checkpoints when present.

    Restore order: the user-named (or latest) Net2 checkpoint from ``logdir2``
    first, then the latest Net1 checkpoint from ``logdir1`` with
    ``global_step`` excluded so the step counter is not overwritten.
    Runs on a single GPU via ``SimpleTrainer``.
    """
    net = Net2()
    dataflow = Net2DataFlow(hp.train2.data_path, hp.train2.batch_size)

    # tensorpack writes events and checkpoints under logdir2
    logger.set_logger_dir(logdir2)

    sess_config = tf.ConfigProto(
        # log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(
            # allow_growth=True,
            per_process_gpu_memory_fraction=0.6,
        ),
    )

    # Build the chain of restore ops (applied in order by ChainInit).
    restorers = []
    if args.ckpt:
        net2_ckpt = '{}/{}'.format(logdir2, args.ckpt)
    else:
        net2_ckpt = tf.train.latest_checkpoint(logdir2)
    if net2_ckpt:
        restorers.append(SaverRestore(net2_ckpt))
    net1_ckpt = tf.train.latest_checkpoint(logdir1)
    if net1_ckpt:
        restorers.append(SaverRestore(net1_ckpt, ignore=['global_step']))

    conf = TrainConfig(
        model=net,
        data=QueueInput(dataflow(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(restorers),
        session_config=sess_config,
    )

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        conf.nr_tower = len(args.gpu.split(','))

    # trainer = SyncMultiGPUTrainerParameterServer(hp.train2.num_gpu)
    launch_train_with_config(conf, trainer=SimpleTrainer())
def train(args, logdir):
    """Train Net1, resuming from a checkpoint in ``logdir`` when available.

    Fix: removed the eight leftover ``print("####…")`` debug statements that
    cluttered stdout; they carried no information beyond execution order.
    """
    model = Net1()
    df = Net1DataFlow(hp.Train1.data_path, hp.Train1.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),
        allow_soft_placement=True)

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=5)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.Train1.num_epochs,
        steps_per_epoch=hp.Train1.steps_per_epoch,
        session_config=session_conf)

    # resume from the user-named checkpoint if given, else the latest one
    ckpt = '{}/{}'.format(
        logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.Train1.num_gpu)
    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir):
    """Train Net, resuming from a checkpoint in ``logdir`` when available.

    Fix: ``session_conf`` (allow_growth plus a 45% per-process GPU memory
    cap) was built but never used because the ``session_config=`` argument
    was commented out in TrainConfig — the GPU settings silently had no
    effect. It is now passed through. The original Chinese comment is also
    translated, and corrected: it claimed 90% while the code sets 0.45.
    """
    model = Net()
    df = NetDataFlow(hp.train.data_path, hp.train.batch_size)

    # set logger for event and model saver
    logger.set_logger_dir(logdir)

    session_conf = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            allow_growth=True,
        ),)
    # cap this process at 45% of GPU memory so the device can be shared
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.45

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            ModelSaver(checkpoint_dir=logdir),
            # TODO EvalCallback()
        ],
        max_epoch=hp.train.num_epochs,
        steps_per_epoch=hp.train.steps_per_epoch,
        session_config=session_conf,  # was commented out; now takes effect
    )

    # resume from the user-named checkpoint if given, else the latest one
    ckpt = '{}/{}'.format(logdir, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir)
    if ckpt:
        train_conf.session_init = SaverRestore(ckpt)

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)
    launch_train_with_config(train_conf, trainer=trainer)
def train(args, logdir2):
    """Train Net2 from precomputed mel/PPG features, resuming when possible.

    Fixes: removed a block of commented-out ckpt1/logdir1 restore code that
    was kept as a no-op triple-quoted string expression, and corrected the
    misspelled status message ("strated" -> "started").
    """
    model = Net2()
    df = Net2DataFlow(hp.train2.mel_path, hp.train2.ppgs_path, hp.train2.batch_size)

    # restore the user-named Net2 checkpoint if given, else the latest one
    session_inits = []
    ckpt2 = '{}/{}'.format(
        logdir2, args.ckpt) if args.ckpt else tf.train.latest_checkpoint(logdir2)
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))

    train_conf = TrainConfig(
        model=model,
        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
        callbacks=[
            # TODO save on prefix net2
            ModelSaver(checkpoint_dir=logdir2),
            # ConvertCallback(logdir2, hp.train2.test_per_epoch),
        ],
        max_epoch=hp.train2.num_epochs,
        steps_per_epoch=hp.train2.steps_per_epoch,
        session_init=ChainInit(session_inits))

    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        train_conf.nr_tower = len(args.gpu.split(','))

    trainer = SyncMultiGPUTrainerReplicated(hp.train2.num_gpu)
    print("started trainer")
    launch_train_with_config(train_conf, trainer=trainer)
# dataset dataset = Dataset(hp.data_path, hp.train.batch_size, length=hp.signal.length) print('dataset size is {}'.format(len(dataset.wav_files))) # set logger for event and model saver logger.set_logger_dir(hp.logdir) train_conf = TrainConfig( model=model, data=TFDatasetInput(dataset()), callbacks=[ ModelSaver(checkpoint_dir=hp.logdir), RunUpdateOps() # for batch norm, exponential moving average # TODO GenerateCallback() ], max_epoch=hp.train.num_epochs, steps_per_epoch=hp.train.steps_per_epoch, ) ckpt = '{}/{}'.format( hp.logdir, ckpt) if ckpt else tf.train.latest_checkpoint(hp.logdir) if ckpt: train_conf.session_init = SaverRestore(ckpt) if gpu is not None: os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, gpu)) train_conf.nr_tower = len(gpu) if hp.train.num_gpu <= 1:
hp.train.tar_labels, hp.train.ntar_labels, length=hp.signal.length, tar_ratio=hp.train.tar_ratio) # set logger for event and model saver logger.set_logger_dir(hp.logdir) model = globals()[hp.model]() print("Model name: {}".format(hp.model)) train_conf = TrainConfig( model=model, data=TFDatasetInput(dataset.train.get_dataset()), callbacks=[ ModelSaver(checkpoint_dir=hp.logdir), EvalCallback(), # RunUpdateOps() # enable this when using batch normalization. ], max_epoch=hp.train.num_epochs, steps_per_epoch=hp.train.steps_per_epoch, ) ckpt = '{}/{}'.format( hp.logdir, ckpt) if ckpt else tf.train.latest_checkpoint(hp.logdir) if ckpt: train_conf.session_init = SaverRestore(ckpt) if gpu is not None: os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, gpu)) train_conf.nr_tower = len(gpu) if hp.train.num_gpu <= 1:
# Dataflow: batches come either from a remote feeder process or a local
# multi-threaded loader sized to ~2/3 of the CPU cores.
audio_meta = AudioMeta(hp.train.data_path)
if args.remote:
    df = get_remote_dataflow(args.port, hp.train.batch_size)
else:
    df = DataLoader(audio_meta, hp.train.batch_size).dataflow(
        nr_prefetch=5000, nr_thread=int(multiprocessing.cpu_count() // 1.5))

# set logger for event and model saver
logger.set_logger_dir(hp.logdir)

# Fix: dropped the vacuous `if True:` guard that wrapped this construction.
train_conf = TrainConfig(
    model=ClassificationModel(num_classes=audio_meta.num_speaker, **hp.model),
    data=FlexibleQueueInput(df, capacity=500),
    callbacks=[
        ModelSaver(checkpoint_dir=hp.logdir),
        EvalCallback()
    ],
    steps_per_epoch=hp.train.steps_per_epoch,
    # session_config=session_config
)

# Resume: an explicit --ckpt wins over the latest checkpoint; --r skips restore.
ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(hp.logdir)
if ckpt and not args.r:
    train_conf.session_init = SaverRestore(ckpt)

if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_conf.nr_tower = len(args.gpu.split(','))

trainer = SyncMultiGPUTrainerReplicated(hp.train.num_gpu)