def initialize(self):
    # TF1-style session setup, kept below for reference. The tf.test.is_gpu_available()
    # check is also commented out since it would create a new session.
    # config = tf.compat.v1.ConfigProto()
    # config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    # # config.log_device_placement = True  # log device placement (on which device the operation ran)
    # sess = tf.compat.v1.Session(config=config)
    # tf.compat.v1.keras.backend.set_session(sess)  # set this session as the default session for Keras

    # Create logger
    self.logger = logging.getLogger('DeepGalaxyTrain')
    self.logger.setLevel(self.log_level)
    self.logger.addHandler(logging.FileHandler('train_log.txt'))

    if self.distributed_training is True:
        try:
            import horovod.tensorflow.keras as hvd
            # Initialize Horovod
            hvd.init()
            self.callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
            self.callbacks.append(hvd.callbacks.MetricAverageCallback())
            if hvd.rank() == 0:
                self.logger.info('Parallel training enabled.')
                self.logger.info('batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                                 % (self.batch_size, self.batch_size * hvd.size(), hvd.size()))

            # Map an MPI process to a GPU (important!)
            print('hvd_rank = %d, hvd_local_rank = %d' % (hvd.rank(), hvd.local_rank()))
            self.logger.info('hvd_rank = %d, hvd_local_rank = %d'
                             % (hvd.rank(), hvd.local_rank()))

            # Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            # if gpus:
            #     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
        except ImportError:
            print('Error importing horovod. Disabling distributed training.')
            self.distributed_training = False
    else:
        self.logger.info('Parallel training disabled.')
        self.logger.info('batch_size = %d' % self.batch_size)
def construct_dataset(filenames, batch_size, n_epochs, sample_shape,
                      rank=0, n_ranks=1, shard=True, shuffle=False,
                      local_fs=False, shuffle_buffer_size=128):
    # Define the dataset from the list of files
    data = tf.data.Dataset.from_tensor_slices(filenames)
    if shard and local_fs:
        local_rank = int(hvd.local_rank())
        local_size = int(hvd.local_size())
        data = data.shard(num_shards=local_size, index=local_rank)
    elif shard:
        data = data.shard(num_shards=n_ranks, index=rank)
    if shuffle:
        data = data.shuffle(len(filenames), reshuffle_each_iteration=True)

    # Parse TFRecords
    parse_data = partial(_parse_data, shape=sample_shape)
    data = data.apply(tf.data.TFRecordDataset).map(parse_data, num_parallel_calls=4)

    # Localized sample shuffling (note: imperfect global shuffling)
    if shuffle:
        data = data.shuffle(shuffle_buffer_size)
    data = data.repeat(n_epochs)
    data = data.batch(batch_size, drop_remainder=True)
    return data.prefetch(4)
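# A minimal sketch of the `_parse_data` helper assumed by construct_dataset above
# (its real body is not shown in this excerpt): decode one serialized TFRecord
# example into an (image, label) pair. The feature keys 'x' and 'y' are
# hypothetical placeholders.
import tensorflow as tf

def _parse_data(sample_proto, shape):
    # Each record is assumed to hold a float tensor 'x' of the given shape and
    # a scalar integer label 'y'.
    features = {'x': tf.io.FixedLenFeature(shape, tf.float32),
                'y': tf.io.FixedLenFeature([], tf.int64)}
    parsed = tf.io.parse_single_example(sample_proto, features)
    return parsed['x'], parsed['y']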
def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1):
    logger = _get_logger()
    if is_distributed:
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
        return [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse
            # final accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()`
            # during the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=_WARMUP_EPOCHS,
                                                     verbose=verbose),
            # Horovod: after the warmup, reduce learning rate by 10 on the 30th, 60th
            # and 80th epochs.
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=_WARMUP_EPOCHS,
                                                       end_epoch=30, multiplier=1.0),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60,
                                                       multiplier=1e-1),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80,
                                                       multiplier=1e-2),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
        ]
    else:
        return []
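# Module-level constants assumed by _get_hooks above (their definitions are not in
# this excerpt). The warmup length matches the "first five epochs" comment in the
# callback list; the _DISTRIBUTED detection mirrors the MPI environment check used
# elsewhere in this file and is an assumption, not the original definition.
import os

_WARMUP_EPOCHS = 5
_DISTRIBUTED = 'OMPI_COMM_WORLD_RANK' in os.environ  # assumed: detect an MPI launch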
def __init__(self, *args, **kwargs):
    super(KerasTests, self).__init__(*args, **kwargs)
    warnings.simplefilter('module')
    hvd.init()
    self.config = tf.ConfigProto()
    self.config.gpu_options.allow_growth = True
    self.config.gpu_options.visible_device_list = str(hvd.local_rank())
def init_horovod(self):
    # Horovod: initialize Horovod.
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
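# What typically follows init_horovod() in Horovod's Keras examples (a sketch,
# not part of the original class): scale the learning rate by the worker count
# and wrap the optimizer so gradients are averaged across workers. The model
# here is an illustrative stand-in.
import keras
from keras.models import Sequential
from keras.layers import Dense
import horovod.keras as hvd

model = Sequential([Dense(10, activation='softmax', input_shape=(784,))])

# Horovod: adjust learning rate based on the number of workers.
opt = keras.optimizers.Adadelta(1.0 * hvd.size())
# Horovod: wrap the optimizer so gradients are averaged across workers.
opt = hvd.DistributedOptimizer(opt)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])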
def _get_runconfig(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    return config
def train_evaluate():
    # Generate training and validation data generators
    def get_image_list(data_dir):
        dataset = []
        for folder in os.listdir(data_dir):
            for image in os.listdir(os.path.join(data_dir, folder)):
                dataset.append((os.path.join(data_dir, folder, image), folder))
        return dataset

    training_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'train')),
                                  FLAGS.batch_size, True)
    validation_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'test')),
                                    FLAGS.batch_size, False)

    # Horovod: initialize Horovod
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    # Create a model
    model = network_model(FLAGS.hidden_units)
    loss = 'categorical_crossentropy'
    # Horovod: adjust learning rate based on number of GPUs
    optimizer = Adadelta(lr=1.0 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer)
    metrics = ['acc']
    model.compile(optimizer, loss, metrics)

    # Set up callbacks
    callbacks = [
        # Broadcast initial variable states from rank 0 to all other processes
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    # Horovod: save logs only on worker 0
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir))

    # Start training
    model.fit_generator(generator=training_data,
                        validation_data=validation_data,
                        epochs=FLAGS.epochs,
                        use_multiprocessing=True,
                        workers=4,
                        callbacks=callbacks,
                        verbose=1)

    # Save the model
    model.save(FLAGS.save_model_path)
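# A minimal sketch (assumed, not the original) of the `ImageSequence` generator the
# snippet above relies on: a keras.utils.Sequence yielding (images, one-hot labels)
# batches from a list of (path, label) tuples. The image size and class mapping are
# illustrative assumptions.
import numpy as np
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array

class ImageSequence(Sequence):
    def __init__(self, image_list, batch_size, shuffle):
        self.image_list = list(image_list)
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Derive a stable class index from the folder names
        self.classes = sorted({label for _, label in self.image_list})

    def __len__(self):
        return len(self.image_list) // self.batch_size

    def __getitem__(self, idx):
        batch = self.image_list[idx * self.batch_size:(idx + 1) * self.batch_size]
        x = np.stack([img_to_array(load_img(path, target_size=(224, 224))) / 255.0
                      for path, _ in batch])
        y = to_categorical([self.classes.index(label) for _, label in batch],
                           num_classes=len(self.classes))
        return x, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.image_list)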
def _main():
    hvd.init()
    better_exceptions.MAX_LENGTH = 128
    _MODELS_DIR.mkdir(parents=True, exist_ok=True)
    logger = tk.log.get()
    logger.addHandler(tk.log.stream_handler())
    if hvd.rank() == 0:
        logger.addHandler(tk.log.file_handler(_MODELS_DIR / 'train.log', append=True))
    with tk.dl.session(gpu_options={'visible_device_list': str(hvd.local_rank())}):
        _run()
def init(global_batch_size, max_gpu_batch_size, gpus=runai.utils.gpus.count()):
    if gpus < 1:
        raise ValueError('GPU count (%d) must be at least 1' % gpus)

    module = sys.modules[__name__]
    setattr(module, 'global_batch_size', global_batch_size)
    setattr(module, 'gpus', gpus)
    setattr(module, 'master', True)

    # TODO(levosos): support uneven dividing
    steps = max(1, global_batch_size // (max_gpu_batch_size * gpus))  # must be at least 1
    batch_size = global_batch_size // (steps * gpus)
    setattr(module, 'steps', steps)
    setattr(module, 'batch_size', batch_size)

    runai.utils.log.info(
        'Spreading global batch size %d across %d GPU(s) each with %d step(s) of batch size %d',
        global_batch_size, gpus, steps, batch_size)

    if gpus > 1:
        runai.utils.log.debug('Initializing Horovod')
        import horovod.keras as hvd
        hvd.init()
        setattr(module, 'master', hvd.local_rank() == 0)
        setattr(module, 'hvd', hvd)  # so Horovod is easily accessible to any importer of this module

        runai.utils.log.debug('Attaching Keras session to GPU #%d', hvd.local_rank())
        import tensorflow
        config = tensorflow.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        import keras.backend
        keras.backend.set_session(tensorflow.Session(config=config))
        # TODO(levosos): support cases where the configuration will be set afterwards
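# Worked example of the batching arithmetic above (values are illustrative):
# with global_batch_size=512, max_gpu_batch_size=64, and gpus=2,
#   steps      = max(1, 512 // (64 * 2)) = 4   gradient-accumulation steps
#   batch_size = 512 // (4 * 2)          = 64  samples per GPU per step,
# so each of the 2 GPUs accumulates 4 steps of 64 samples to realize the global batch.
init(global_batch_size=512, max_gpu_batch_size=64, gpus=2)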
def main(args):
    # ===== modified: initialize Horovod and pin one GPU per process =====
    # import horovod.keras as hvd
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    # ===== modified: pass hvd into the model builder =====
    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay, args.optimizer,
                           args.momentum, hvd)

    callbacks = []
    # ===== modified: write checkpoints to model_output_dir =====
    # callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
    # ===== modified: add the Horovod callbacks =====
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
    # ===== modified: checkpoint and log only on worker 0 =====
    if hvd.rank() == 0:
        callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(TensorBoard(log_dir=args.model_dir, update_freq='epoch'))

    logging.info("Starting training")
    model.fit(x=train_dataset[0],
              y=train_dataset[1],
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size),
              epochs=args.epochs,
              validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size),
              callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # ===== modified: save to model_output_dir =====
    # return save_model(model, args.model_dir)
    return save_model(model, args.model_output_dir)
def perform_setup(options):
    import numpy as np
    import sys
    import keras
    import keras.backend as K
    import tensorflow as tf

    sys.setrecursionlimit(5000)

    if options.with_hvd:
        import horovod.keras as hvd
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        if options.gpu > 1:
            devlist = ','.join(str(i) for i in range(options.gpu))
            config.gpu_options.visible_device_list = devlist
        else:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
    else:
        # Without Horovod, cap per-process GPU memory instead of pinning a device
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.25
        K.set_session(tf.Session(config=config))

    global _globalnpfile
    global _globalexpectedpixel
    global IMG_DTYPE
    global SEG_DTYPE
    global FLOAT_DTYPE
    global _nx
    global _ny
    # raw dicom data is usually short int (2 bytes); labels are usually uchar (1 byte)
    IMG_DTYPE = np.int16
    SEG_DTYPE = np.uint8
    FLOAT_DTYPE = np.float32
    # _globalnpfile = options.dbfile.replace('.csv', '%d.npy' % options.trainingresample)
    # _globalexpectedpixel = 512
    _nx = options.trainingresample
    _ny = options.trainingresample
    return IMG_DTYPE, SEG_DTYPE, _nx, _ny
def init(global_batch_size, max_gpu_batch_size, gpus=None):
    # First of all, calculate the number of gradient-accumulation steps and the batch size
    runai.elastic._init(global_batch_size, max_gpu_batch_size, gpus)

    # Now use Horovod if needed
    if runai.elastic.gpus > 1:
        runai.utils.log.debug('Initializing Horovod')
        import horovod.keras as hvd
        hvd.init()
        setattr(runai.elastic, 'master', hvd.local_rank() == 0)
        setattr(runai.elastic, 'hvd', hvd)  # so Horovod is easily accessible to any importer

        runai.utils.log.debug('Attaching Keras session to GPU #%d', hvd.local_rank())
        import tensorflow
        config = tensorflow.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        import keras.backend
        keras.backend.set_session(tensorflow.Session(config=config))
        # TODO(levosos): support cases where the configuration will be set afterwards
def init_keras(hvd=None):
    """
    Set config for Horovod. Config params copied from the official example:
    https://github.com/uber/horovod/blob/master/examples/keras_mnist_advanced.py#L15

    :param hvd: instance of horovod.keras
    """
    init_cuda_env()
    config = tf.ConfigProto()
    if hvd:
        hvd.init()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    set_session(tf.Session(config=config))
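# Example usage (a sketch): pass the horovod.keras module itself, as the docstring's
# parameter description suggests, so this process is pinned to the GPU matching its
# local rank; call with hvd=None for single-process runs.
import horovod.keras as hvd
init_keras(hvd=hvd)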
def main(use_horovod: bool, gpus: int, checkpoint: int, config_path: str) -> None:
    config = process_config(config_path, use_horovod, gpus, checkpoint)

    # Create the TensorFlow session and set it as the Keras backend session
    tf_config = tf.ConfigProto()
    if config.trainer.use_horovod:
        import horovod.keras as hvd
        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd
        is_master = hvd.rank() == 0

    if is_master and not os.path.exists(config.exp.source_dir):
        # Copy source files
        shutil.copytree(
            os.path.abspath(os.path.curdir),
            config.exp.source_dir,
            ignore=lambda src, names: {"datasets", "__pycache__", ".git",
                                       "experiments", "venv"})

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)

    data_loader = get_data_loader(config=config)
    model, trainer = build_model_and_trainer(config, data_loader)

    print(f"Start Training Experiment {config.exp.name}")
    try:
        trainer.train()
    except Exception as e:
        send_noti_to_telegram(f"an exception raised on training {config.exp.name}")
        raise e
def setup_tf_config(config: DotMap):
    tf_config = tf.ConfigProto()
    if config.trainer.use_horovod:
        import horovod.keras as hvd
        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd
        is_master = hvd.rank() == 0

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)
    return is_master
def get_session(log_device_placement=False, allow_soft_placement=True,
                debug=False, device_count=None):
    """
    TODO/FIXME: on interpreter exit this can cause
      Exception UnboundLocalError: "local variable 'status' referenced before
      assignment" in <bound method Session.__del__ of
      <tensorflow.python.client.session.Session object at 0x858af10>> ignored
    A global `sess` (module-level or inside a function) triggers this, but it is
    not a big problem; accepted for convenience for now.
    """
    if not hasattr(get_session, 'sess') or get_session.sess is None:
        if device_count is None:
            config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                    log_device_placement=log_device_placement)
        else:
            config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                    log_device_placement=log_device_placement,
                                    device_count=device_count)

        use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ
        if use_horovod:
            config.gpu_options.allow_growth = True
            import horovod.keras as hvd
            config.gpu_options.visible_device_list = str(hvd.local_rank())

        # sess = tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
        #                                          config=config)
        # config.operation_timeout_in_ms = 600000
        # NOTICE https://github.com/tensorflow/tensorflow/issues/2130 -- but 5000 causes init problems!
        # config.operation_timeout_in_ms = 50000  # terminate on long hangs
        # https://github.com/tensorflow/tensorflow/issues/2292 allow_soft_placement=True
        if FLAGS.use_tpu:
            tpu_cluster_resolver = None
            if FLAGS.use_tpu and FLAGS.tpu_name:
                tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                    FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
            # tf.Session expects a target string, not the resolver object itself
            get_session.sess = tf.Session(
                tpu_cluster_resolver.master() if tpu_cluster_resolver else '',
                config=config)
        else:
            get_session.sess = tf.Session(config=config)
        if debug:
            from tensorflow.python import debug as tf_debug
            get_session.sess = tf_debug.LocalCLIDebugWrapperSession(get_session.sess)
    return get_session.sess
def create_inception_model(self, number_categories, dense_layer_sizes, dropout_fraction,
                           unfrozen_layers, focal_loss=False):
    hvd.init()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # Apply the config as the Keras session; without this the GPU pinning above
    # has no effect.
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))
    opt = hvd.DistributedOptimizer(
        tf.keras.optimizers.Adam(learning_rate=0.001 * hvd.size()))

    model = InceptionV3(include_top=False, pooling='avg')
    output = model.outputs[0]
    for layer_size in dense_layer_sizes:
        dense = Dense(layer_size, activation='relu')(output)
        dropout = Dropout(dropout_fraction)(dense)
        output = BatchNormalization()(dropout)
    if number_categories == 1:
        output = Dense(1, activation='sigmoid')(output)
    else:
        output = Dense(number_categories, activation='softmax')(output)
    model = Model(inputs=model.inputs, outputs=output)

    # Freeze all but the last `unfrozen_layers` layers
    for index in range(len(model.layers) - unfrozen_layers):
        model.layers[index].trainable = False

    if number_categories == 1:
        the_metrics = [metrics.binary_accuracy]
        if focal_loss:
            loss = customlosses.focal_binary_crossentropy
        else:
            loss = 'binary_crossentropy'
    else:
        the_metrics = [metrics.categorical_accuracy]
        if focal_loss:
            loss = customlosses.focal_categorical_crossentropy
        else:
            loss = 'categorical_crossentropy'

    model.compile(optimizer=opt, loss=loss, metrics=the_metrics)
    model.save(self.model_filename)
    self.model = model
def perform_setup(options):
    sys.setrecursionlimit(5000)

    if options.with_hvd:
        import horovod.keras as hvd
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        if options.gpu > 1:
            devlist = ','.join(str(i) for i in range(options.gpu))
            config.gpu_options.visible_device_list = devlist
        else:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))

    global _globalnpfile
    global _globalexpectedpixel
    global IMG_DTYPE
    global SEG_DTYPE
    global _nx
    global _ny
    # raw dicom data is usually short int (2 bytes); labels are usually uchar (1 byte)
    IMG_DTYPE = np.int16
    SEG_DTYPE = np.uint8
    _globalnpfile = options.dbfile.replace('.csv', '%d.npy' % options.trainingresample)
    _globalexpectedpixel = 512
    _nx = options.trainingresample
    _ny = options.trainingresample
    return IMG_DTYPE, SEG_DTYPE, _globalnpfile, _globalexpectedpixel, _nx, _ny
def main(args):
    mpi = False
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'output', args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    if args.fw_params.get('sagemaker_mpi_enabled'):
        import horovod.keras as hvd
        mpi = True
        # Horovod: initialize Horovod.
        hvd.init()
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    else:
        hvd = None

    train_dataset = train_input_fn(hvd, mpi)
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn(hvd, mpi)

    model = keras_model_fn(args.learning_rate, args.weight_decay, args.optimizer,
                           args.momentum, mpi, hvd)

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
        callbacks.append(keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        if hvd.rank() == 0:
            callbacks.append(ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.ckpt',
                                             save_weights_only=True, verbose=2))
            callbacks.append(TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    else:
        callbacks.append(keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        callbacks.append(ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.ckpt',
                                         save_weights_only=True, verbose=2))
        callbacks.append(TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))

    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()

    model.fit(train_dataset,
              steps_per_epoch=((num_examples_per_epoch('train') // args.batch_size) // size),
              epochs=args.epochs,
              validation_data=validation_dataset,
              validation_steps=((num_examples_per_epoch('validation') // args.batch_size) // size),
              callbacks=callbacks,
              verbose=2)

    if not mpi or (mpi and hvd.rank() == 0):
        score = model.evaluate(eval_dataset,
                               steps=num_examples_per_epoch('eval') // args.batch_size,
                               verbose=2)
        logging.info('Test loss:{}'.format(score[0]))
        logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: save model only on worker 0 (i.e. master)
    if mpi:
        if hvd.rank() == 0:
            model.save(args.model_output_dir)
    else:
        model.save(args.model_output_dir)
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import math
import tensorflow as tf
import horovod.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
from __future__ import absolute_import
import os
import socket

import keras
import horovod.keras as hvd

from rpv import load_file, build_model, train_model

print('Distributed RPV classifier training')

# Initialize horovod
hvd.init()
print('MPI rank %i, local rank %i, host %s' %
      (hvd.rank(), hvd.local_rank(), socket.gethostname()))

# Data config
n_train = 32000  # 412416
n_valid = 16000  # 137471
n_test = 16000   # 137471
input_dir = '/data0/users/sfarrell/atlas-rpv-images'
# input_dir = '/global/cscratch1/sd/sfarrell/atlas-rpv-images'

# Load the data files
train_file = os.path.join(input_dir, 'train.h5')
valid_file = os.path.join(input_dir, 'val.h5')
test_file = os.path.join(input_dir, 'test.h5')

train_input, train_labels, train_weights = load_file(train_file, n_train)
valid_input, valid_labels, valid_weights = load_file(valid_file, n_valid)
test_input, test_labels, test_weights = load_file(test_file, n_test)
def train_hvd(modelCode, model, trainMap, val_df, mode, tf, learning_rate,
              min_max_scaler, isBinary, old_weights, startSet, startEpoch):
    tensor_board = TensorBoard(log_dir=tfb_log_dir, histogram_freq=1,
                               write_graph=True, write_images=True)
    if isBinary:
        classType = "BINARY"
        targetColName = "LABEL1"
    else:
        classType = "MULTI"
        targetColName = "LABEL2"

    if mode == "HRV":
        # Horovod: initialize Horovod.
        hvd.init()
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
        # Horovod: adjust learning rate based on number of GPUs.
        v_optimizer = keras.optimizers.Adadelta(learning_rate * hvd.size())
        # Horovod: wrap optimizer with Horovod DistributedOptimizer.
        v_optimizer = hvd.DistributedOptimizer(v_optimizer)
        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes, to ensure consistent initialization of all workers when
        # training starts with random weights or is restored from a checkpoint.
        callbacks = [tensor_board, hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
    else:
        v_optimizer = keras.optimizers.Adam(lr=learning_rate)
        # v_optimizer = keras.optimizers.RMSprop(lr=learning_rate)
        # v_optimizer = keras.optimizers.SGD(lr=learning_rate, clipvalue=1)
        # v_optimizer = keras.optimizers.SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
        callbacks = [tensor_board]

    print("Start Train Model ", mode)
    cLoop = 1
    test_df = {}
    resultMetric = {}
    score = 0
    rolling_win_size = 60
    isNotSatisfacEval = True
    nEpochs = 10
    lossOptimal = False
    history = ""
    cvscores = []
    curVLoss = 0.001
    result = []
    maxBatch = 10
    val_seq_array, val_label_array = gen_data_test_val(targetColName, val_df,
                                                       sequence_length, sequence_cols)
    # Multiple classification:
    # val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')

    for nEpoch in range(nEpochs):
        countTrainSet = 1
        trainDataSetKeys = trainMap.keys()

        # Hyperparameters
        v_batch_size = 200
        v_validation_split = 0.05
        v_verbose = 2  # verbosity mode: 0 = silent, 1 = progress bar, 2 = one line per epoch
        v_LSTMUnitLayer1 = 150
        v_LSTMUnitLayer2 = 60
        v_LSTMUnitLayer3 = 30
        v_Dropout = 0.2
        v_maxEpoch = 1
        scores_test = []

        for trainKey in trainDataSetKeys:
            if trainKey >= startSet and nEpoch >= startEpoch:
                if isNotSatisfacEval is True:
                    print("Starting Loop (cLoop) : ", str(cLoop))
                    print("Train model using dataset {", str(trainKey), "}")
                    isTrainSet = True
                    train_df_new = getDataFromCSV(sqlContext, dbFSDir, trainMap[trainKey],
                                                  selectedCols, isTrainSet, isBinary)
                    # Correct sample labels
                    train_df_new = genSampleLabel(train_df_new)
                    train_df_new = train_df_new.sort_values(['CODE', 'YEAR', 'EVENT_ID', 'CYCLE'])
                    train_df_new = add_features(train_df_new, rolling_win_size, sensor_cols)
                    train_df_new = train_df_new.drop(columns=columns_to_drop)
                    # train_df_new, min_max_scaler = normalizeMaxMinTrain(train_df_new, min_max_scaler)
                    train_df_new = train_df_new.sort_values(['EVENT_ID', 'CYCLE'])
                    train_df_new = train_df_new.drop_duplicates(['EVENT_ID', 'CYCLE'], keep='last')
                    printDFPortion(train_df_new, val_df, targetColName)

                    seq_array, label_array, nb_features, nb_out = gen_data_train_val(
                        targetColName, train_df_new, sequence_length, sequence_cols)

                    # Horovod: save checkpoints only on worker 0 to prevent other
                    # workers from overwriting and corrupting them.
                    if mode == "HRV":
                        if hvd.rank() == 0:
                            callbacks.append(keras.callbacks.ModelCheckpoint(
                                checkpoint_dir + str(cLoop) + 'checkpoint-hvd.hdf5',
                                save_weights_only=True))

                    original_label_array = label_array
                    # Multiple classification:
                    # label_array = to_categorical(label_array, num_classes=3, dtype='int32')
                    nb_classes = label_array.shape[1]
                    vb_classes = val_label_array.shape[1]

                    if len(old_weights) == 0 and classType == "MULTI":
                        model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer1, return_sequences=True),
                                                input_shape=(sequence_length, nb_features),
                                                merge_mode='concat'))
                        model.add(Dropout(v_Dropout))
                        model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer2, return_sequences=True)))
                        model.add(Dropout(v_Dropout))
                        model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer3, return_sequences=False)))
                        model.add(Dropout(v_Dropout))
                        model.add(Dense(units=nb_classes, activation='softmax'))
                    elif len(old_weights) == 0 and classType == "BINARY":
                        model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer1, return_sequences=True),
                                                input_shape=(sequence_length, nb_features),
                                                merge_mode='concat'))
                        model.add(Dropout(v_Dropout))
                        model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer2, return_sequences=False)))
                        model.add(Dropout(v_Dropout))
                        model.add(Dense(units=nb_out, activation='sigmoid'))
                        print("nb_out:", nb_out)
                    else:
                        print("Model Already Constructed.")

                    try:
                        if old_weights != "":
                            model.set_weights(old_weights)
                            print("Reset weights successfully.")
                    except:
                        print("Failed reset weights.")

                    # try:
                    #     model = multi_gpu_model(model, gpus=2)
                    #     print("Training using multiple GPUs..")
                    # except:
                    #     print("Training using single GPU or CPU..")

                    if nb_classes > 2:
                        model.compile(loss='categorical_crossentropy',
                                      optimizer=v_optimizer,
                                      metrics=['accuracy'])
                        print("set loss: categorical_crossentropy")
                    else:
                        model.compile(loss='binary_crossentropy',
                                      optimizer=v_optimizer,
                                      metrics=['accuracy'])
                        print("set loss: binary_crossentropy")
                    print(model.summary())

                    processCode = str(cLoop) + "_R_" + str(trainKey)
                    if mode == "HRV":
                        if hvd.rank() == 0:
                            callbacks.append(keras.callbacks.ModelCheckpoint(
                                checkpoint_dir + '/' + processCode + '_checkpoint-{epoch}.h5'))

                    # Train (utilizes Horovod when mode == "HRV")
                    history = model.fit(seq_array, label_array,
                                        batch_size=v_batch_size,
                                        epochs=v_maxEpoch,
                                        verbose=2,
                                        # validation_data=(val_seq_array, val_label_array),
                                        validation_split=v_validation_split,
                                        callbacks=callbacks)

                    try:
                        old_weights = model.get_weights()
                    except:
                        print("Error get_weights !")

                    # List all data in history
                    print(history.history.keys())

                    try:
                        cm, precision_test, recall_test, f1_test, y_true_label, \
                            y_pred_class, y_pred_prop, y_pred_prob_threshold = \
                            evaluationMetrics(val_seq_array, val_label_array, isBinary, model)
                    except:
                        precision_test = 0
                        recall_test = 0
                        f1_test = 0
                        print("Error in evaluation performance [evaluationMetrics]!")

                    if len(old_weights) == 0:
                        print("Error Empty Weights!!")
                    else:
                        print("Has weights!!")

                    if mode != "HRV":
                        try:
                            currentModelPath = processCode + "_" + model_path
                            print("Trying to save model : " + currentModelPath)
                            model.save(currentModelPath)
                            try:
                                fromPath = "file:/databricks/driver/" + currentModelPath
                                print("Copying file [", fromPath, "] to Data Lake....")
                                copyData(fromPath, dataLake + "/model", False)
                            except:
                                print("Error while trying to transfer file " + fromPath,
                                      " to ", dataLake + "/model")
                            print("Model Saved >> ", currentModelPath)
                        except:
                            print("Error Saving Model", currentModelPath)

                    try:
                        lossOptimal, score, result, curVLoss = isOptimal(
                            history, countTrainSet, score, curVLoss, nEpoch)
                        resultMetric[cLoop] = [cLoop, processCode] + result + \
                            [precision_test, recall_test, f1_test]
                        print(resultMetric)
                        saveFileToDataLake(resultMetric)
                    except:
                        print("Error writing metric file.")

                    if lossOptimal is False:
                        countTrainSet = countTrainSet + 1
                    else:
                        break
                    cLoop = cLoop + 1
                else:
                    print("Skip DataSet:", trainKey)
            else:
                print("Train and evaluation is satisfactory!")
                break
    return model
# import tensorflow as tf
# config = tf.ConfigProto(log_device_placement=True)

from EcalEnergyGan import generator, discriminator

import tensorflow as tf
import horovod.keras as hvd

# Initialize Horovod.
hvd.init()

# Pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import time
time.sleep(10 * hvd.local_rank())
import setGPU
# config.gpu_options.visible_device_list = str(hvd.local_rank())
tf.Session(config=config)

g_weights = 'params_generator_epoch_'
d_weights = 'params_discriminator_epoch_'

nb_epochs = 25
batch_size = 128
latent_size = 200
verbose = 'false'
nb_classes = 2

generator = generator(latent_size)
discriminator = discriminator()
        resume_training = False
    elif opt in ('-l', '--length'):
        predict_length = int(arg)
    elif opt in ('-m', '--mgpu'):
        multi_gpu = True
    elif opt in ('-e', '--epoch'):
        epoch = int(arg)

if multi_gpu:
    import tensorflow as tf
    import horovod.keras as hvd
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    print('hvd.local_rank: ', hvd.local_rank())
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

"""
Keras2-based WaveNet
Based originally on Bas Veeling's implementation at:
https://github.com/basveeling/wavenet/ which is (c) Bas Veeling

Copyright (c) MUNICH ARTIFICIAL INTELLIGENCE LABORATORIES GmbH. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of the
License at
def main(args):
    # Initialize Horovod.
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    fold = args.data_path.split("fold_")[1]
    if hvd.rank() == 0:
        print("================================")
        if args.use_lovasz:
            print("Fine tuning with Lovasz loss")
        print("Fold {}".format(fold))

    # Find the best saved model
    best_model_file = 'weights/{}/fold_{}_{epoch}_best.h5'.format(args.model, fold,
                                                                  epoch='{epoch}')
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if hvd.rank() == 0:
        print("Last model saved: {}".format(best_model_file.format(epoch=resume_from_epoch)))
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Verbose mode on one node only
    if hvd.rank() == 0:
        verbose = 1
    else:
        verbose = 0

    # Create dataset
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(
        input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(
        input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()

    # Create model
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    # Load weights
    if resume_from_epoch > 0:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))

    size = hvd.size()
    opt = hvd.DistributedOptimizer(SGD(lr=args.learning_rate * size,
                                       momentum=0.9, nesterov=True))

    # Loss
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    model.compile(loss=loss, optimizer=opt,
                  metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    # h5 model
    best_model = ModelCheckpointMGPU(model, filepath=best_model_file, monitor='val_loss',
                                     verbose=1, mode='min', period=1,
                                     save_best_only=True, save_weights_only=True)
    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),
        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse
        # final accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()`
        # during the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs,
                                                 verbose=True)
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers
    # from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)

    # Fit model
    history = model.fit_generator(train_data_generator,
                                  steps_per_epoch=train_step_size,
                                  callbacks=callbacks,
                                  epochs=args.epochs,
                                  verbose=verbose,
                                  workers=4,
                                  initial_epoch=resume_from_epoch,
                                  validation_data=val_data_generator,
                                  validation_steps=val_step_size)

    score = hvd.allreduce(model.evaluate_generator(val_data_generator, val_step_size,
                                                   workers=4))
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
def main(args):
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model', args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    if os.path.isdir(args.checkpoint_path):
        logging.info("Checkpointing directory {} exists".format(args.checkpoint_path))
    else:
        logging.info("Creating Checkpointing directory {}".format(args.checkpoint_path))
        os.mkdir(args.checkpoint_path)

    mpi = False
    if args.fw_params.get('sagemaker_mpi_enabled'):
        import horovod.keras as hvd
        mpi = True
        # Horovod: initialize Horovod.
        hvd.init()
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
    else:
        hvd = None
    logging.info("Running with MPI={}".format(mpi))

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")
    # Load model
    if not os.listdir(args.checkpoint_path):
        model = keras_model_fn(args.learning_rate, args.weight_decay, args.optimizer,
                               args.momentum, mpi, hvd)
        epoch_number = 0
    else:
        model, epoch_number = load_checkpoint_model(args.checkpoint_path)
    logging.info("Checkpointing to: {}".format(args.checkpoint_path))

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
        callbacks.append(keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        if hvd.rank() == 0:
            callbacks.append(ModelCheckpoint(args.checkpoint_path + '/checkpoint-{epoch}.h5'))
            callbacks.append(TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    else:
        callbacks.append(keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        callbacks.append(ModelCheckpoint(args.checkpoint_path + '/checkpoint-{epoch}.h5'))
        callbacks.append(TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))

    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()
    model.fit(x=train_dataset[0],
              y=train_dataset[1],
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size) // size,
              epochs=args.epochs,
              initial_epoch=epoch_number,
              validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size) // size,
              callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)
    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: save model only on worker 0 (i.e. master)
    if mpi:
        if hvd.rank() == 0:
            save_model(model, args.model_output_dir)
    else:
        save_model(model, args.model_output_dir)
def main():
    """Main function"""

    # Initialize horovod
    hvd.init()

    # Parse the command line
    args = parse_args()

    # Setup logging
    log_format = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logging.info('Initializing')
    if args.show_config:
        logging.info('Command line config: %s' % args)
    logging.info('MPI rank %i, local rank %i, host %s' %
                 (hvd.rank(), hvd.local_rank(), socket.gethostname()))

    # Load configuration file
    with open(args.config) as f:
        config = yaml.safe_load(f)  # safe_load avoids PyYAML's deprecated default loader
    logging.info('Configuration: %s' % config)

    # Load the data files
    train_data, valid_data, test_data = load_dataset(**config['data_config'])
    train_input, train_labels, train_weights = train_data
    valid_input, valid_labels, valid_weights = valid_data
    test_input, test_labels, test_weights = test_data
    logging.info('train shape: %s Mean label %s' % (train_input.shape, train_labels.mean()))
    logging.info('valid shape: %s Mean label %s' % (valid_input.shape, valid_labels.mean()))
    logging.info('test shape: %s Mean label %s' % (test_input.shape, test_labels.mean()))

    # Configure the session (e.g. thread settings)
    keras.backend.set_session(configure_session(**config['session_config']))

    # Scale the learning rate
    model_config = config['model_config']
    if model_config.pop('scale_learning_rate'):
        model_config['learning_rate'] = model_config['learning_rate'] * hvd.size()

    # Build the model
    logging.info(config)
    model = build_model(train_input.shape[1:], use_horovod=True, **model_config)
    if hvd.rank() == 0:
        model.summary()

    # Training hooks
    callbacks = []

    # Horovod model synchronization during initialization
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    # Model checkpointing
    if hvd.rank() == 0:
        checkpoint_file = os.path.expandvars(config['checkpoint_file'])
        os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_file))

    # Batch size
    training_config = config['training_config']
    bsize = training_config['batch_size']
    per_node = training_config.pop('batch_size_per_node')
    training_config['batch_size'] = bsize if per_node else (bsize // hvd.size())

    # Run the training
    logging.info('Final training config: %s' % training_config)
    history = model.fit(x=train_input, y=train_labels,
                        validation_data=(valid_input, valid_labels),
                        callbacks=callbacks, verbose=2,
                        **training_config)

    # Evaluate on the test set
    test_loss, test_acc = model.evaluate(test_input, test_labels, verbose=2)
    logging.info('Test loss: %g' % test_loss)
    logging.info('Test accuracy: %g' % test_acc)

    # Drop to IPython interactive shell
    if args.interactive:
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    logging.info('All done!')
                        help='Training dataset Name.',
                        dest='dataset_type')
    args = parser.parse_args()

    # Checkpoints will be written in the log directory.
    args.checkpoint_format = os.path.join(args.output_path, 'checkpoint-{epoch}.h5')

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # If set > 0, training will resume from the given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
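    # A sketch of the restore step that typically follows in Horovod's Keras
    # examples (an assumption here; it is not shown in the original excerpt):
    # rank 0 reloads the discovered checkpoint, and hvd.load_model re-wraps the
    # saved optimizer in hvd.DistributedOptimizer. Other ranks build the model
    # from scratch and receive rank 0's weights via
    # BroadcastGlobalVariablesCallback(0).
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch))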
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import math
import tensorflow as tf
import horovod.keras as hvd

# Initialize Horovod
hvd.init()

# Pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

batch_size = 128
num_classes = 10

# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
def init_callbacks(self) -> None:
    if self.config.trainer.use_lr_decay:
        # Linear decay starting from the half of max_epochs
        def lr_scheduler(lr, epoch, max_epochs):
            return min(lr, 2 * lr * (1 - epoch / max_epochs))

        self.model_callbacks["combined"].append(
            LearningRateScheduler(schedule=lambda epoch: lr_scheduler(
                self.config.model.generator.lr, epoch, self.config.trainer.num_epochs)))
        for model_name in ['d_x', 'd_y']:
            self.model_callbacks[model_name].append(
                LearningRateScheduler(schedule=lambda epoch: lr_scheduler(
                    self.config.model.discriminator.lr, epoch,
                    self.config.trainer.num_epochs)))

    # If Horovod is used, only worker 0 saves checkpoints
    is_master = True
    is_local_master = True
    if self.config.trainer.use_horovod:
        import horovod.keras as hvd
        is_master = hvd.rank() == 0
        is_local_master = hvd.local_rank() == 0
        # Horovod callbacks
        self.model_callbacks["combined"].append(
            hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        self.model_callbacks["combined"].append(hvd.callbacks.MetricAverageCallback())
        self.model_callbacks["combined"].append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))

    if is_local_master:
        # Model saver
        self.model_callbacks["serial_combined"].append(
            ModelCheckpointWithKeepFreq(
                filepath=os.path.join(self.config.exp.checkpoints_dir,
                                      "{epoch:04d}-combined.hdf5"),
                keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                save_checkpoint_freq=self.config.trainer.save_checkpoint_freq,
                save_best_only=False,
                save_weights_only=True,
                verbose=1))

        # Save optimizer weights
        for model_name in ['combined', 'd_x', 'd_y']:
            self.model_callbacks[model_name].append(OptimizerSaver(self.config, model_name))

    if is_master:
        # Save individual models
        for model_name in ['g_xy', 'g_yx', 'd_x', 'd_y']:
            self.model_callbacks[model_name].append(
                ModelSaver(
                    checkpoint_dir=self.config.exp.checkpoints_dir,
                    keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                    model_name=model_name,
                    num_epochs=self.config.trainer.num_epochs,
                    verbose=1))

        # Send a notification to the telegram channel on train start and end
        self.model_callbacks["combined"].append(
            TrainProgressAlertCallback(experiment_name=self.config.exp.name,
                                       total_epochs=self.config.trainer.num_epochs))

        # TensorBoard callback
        self.model_callbacks["combined"].append(
            ScalarCollageTensorBoard(log_dir=self.config.exp.tensorboard_dir,
                                     batch_size=self.config.trainer.batch_size,
                                     write_images=True))

    # Initialize callbacks by setting model and params
    epochs = self.config.trainer.num_epochs
    steps_per_epoch = self.data_loader.get_train_data_size() // self.config.trainer.batch_size
    for model_name in self.model_callbacks:
        model = eval(f"self.{model_name}")
        callbacks = self.model_callbacks[model_name]
        for callback in callbacks:
            callback.set_model(model)
            callback.set_params({
                "batch_size": self.config.trainer.batch_size,
                "epochs": epochs,
                "steps": steps_per_epoch,
                "samples": self.data_loader.get_train_data_size(),
                "verbose": True,
                "do_validation": False,
                "model_name": model_name,
            })
import os

import horovod.keras as hvd

from utils.device import configure_session

distributed = False
rank, n_ranks = 0, 1
if distributed:
    hvd.init()
    rank, n_ranks = hvd.rank(), hvd.size()
if rank == 0:
    print('rank {}, n_ranks {}'.format(rank, n_ranks))
if n_ranks > 1:
    gpu = hvd.local_rank()
    configure_session(gpu=gpu)

profile_downsample = 2

'''
efit_type = 'EFITRT1'
input_profile_names = ['thomson_dens_{}'.format(efit_type),
                       'thomson_temp_{}'.format(efit_type)]
target_profile_names = ['temp', 'dens']
actuator_names = ['pinj', 'curr', 'tinj', 'gasA']
profile_lookback = 1
actuator_lookback = 10
'''

if True:
    processed_filename_base = '/global/cscratch1/sd/abbatej/processed_data/'
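# A plausible sketch of the imported `utils.device.configure_session` helper (its
# real body is not shown in this excerpt): pin the Keras session to one GPU and
# enable memory growth, mirroring the per-rank GPU pinning used throughout the
# other snippets in this file.
import tensorflow as tf
import keras.backend as K

def configure_session(gpu=0):
    # One visible GPU per process; grow memory on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(gpu)
    K.set_session(tf.Session(config=config))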