def setup_callbacks(params, callbacks, encoder, decoder, prop_pred):
    import horovod.keras as hvd

    # model checkpointing
    if params.checkpoint_period and hvd.rank() == 0:
        model_checkpoint_callback = model_checkpoint(
            encoder, decoder, prop_pred, params.checkpoint_path,
            nepochs=params.checkpoint_period,
            overwrite=params.overwrite_checkpoint)
        callbacks.append(model_checkpoint_callback)

    # LR scheduler
    if params.lr_schedule_patience:
        lr_callback = ReduceLROnPlateau(monitor=params.lr_schedule_prop,
                                        factor=0.5,
                                        patience=params.lr_schedule_patience,
                                        min_lr=params.lr_schedule_min * hvd.size(),
                                        cooldown=params.lr_schedule_cooldown,
                                        verbose=(hvd.rank() == 0))
        callbacks.append(lr_callback)

    if hvd.rank() == 0:
        callbacks.append(print_loss())
        if params.enable_tensorboard:
            callbacks.append(TensorBoard(params.checkpoint_path))
def test_load_model_broadcast(self):
    def create_model():
        opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')
        return model

    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        model = create_model()

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        if hvd.rank() == 0:
            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

    K.clear_session()
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        if hvd.rank() == 0:
            model = hvd.load_model(fname)
            os.remove(fname)
        else:
            model = create_model()

        def generator():
            while 1:
                yield (x, y)

        if hvd.rank() == 0:
            self.assertEqual(len(model.optimizer.weights), 5)
        else:
            self.assertEqual(len(model.optimizer.weights), 0)

        # No assertions, we just need to verify that it doesn't hang
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        model.fit_generator(generator(),
                            steps_per_epoch=10,
                            callbacks=callbacks,
                            epochs=0,
                            verbose=0,
                            workers=4,
                            initial_epoch=1)

        self.assertEqual(len(model.optimizer.weights), 5)
def test_elastic_state(self):
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        v = 1.0 if hvd.rank() == 0 else 2.0
        model1 = keras.models.Sequential([
            keras.layers.Dense(2, activation='softmax')
        ])
        model1.build((2, 2))
        model1.set_weights(
            [np.array([[v, v], [v, v]], dtype=np.float32),
             np.array([v, v], dtype=np.float32)])

        model2 = keras.models.Sequential([
            keras.layers.Dense(2, activation='softmax')
        ])
        model2.build((2, 2))
        model2.set_weights(
            [np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
             np.array([0.0, 0.0], dtype=np.float32)])

        optimizer = keras.optimizers.Adam(0.001 * hvd.size())

        state = hvd.elastic.KerasState(model1, optimizer,
                                       batch=20 + hvd.rank(),
                                       epoch=10 + hvd.rank())
        state.sync()

        model1_weights = model1.get_weights()
        model2_weights = model2.get_weights()

        # After sync, all values should match the root rank
        for w in state.model.get_weights():
            self.assertAllClose(w, np.ones_like(w))
        assert state.batch == 20
        assert state.epoch == 10

        # Partially modify then restore
        model1.set_weights(model2_weights)
        state.batch = 21
        state.epoch = 11

        state.restore()

        for w1, w2 in zip(model1.get_weights(), model1_weights):
            self.assertAllClose(w1, w2)
        assert state.batch == 20
        assert state.epoch == 10

        # Partially modify then commit
        model1.set_weights(model2_weights)
        state.batch = 21
        state.epoch = 11

        state.commit()
        state.restore()

        for w1, w2 in zip(model1.get_weights(), model2_weights):
            self.assertAllClose(w1, w2)
        assert state.batch == 21
        assert state.epoch == 11
def initialize(self):
    # init_op = tf.initialize_all_variables()
    # init_op = tf.global_variables_initializer()
    # sess = tf.Session()
    # sess.run(init_op)

    # Check if GPUs are available
    # if tf.test.is_gpu_available():  # commented out since this test would cause a new session to be created
    # allow growth
    # config = tf.compat.v1.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 1
    # config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    # # config.log_device_placement = True  # to log device placement (on which device the operation ran)
    # sess = tf.compat.v1.Session(config=config)
    # tf.compat.v1.keras.backend.set_session(sess)  # set this TensorFlow session as the default session for Keras

    # Create logger
    self.logger = logging.getLogger('DeepGalaxyTrain')
    self.logger.setLevel(self.log_level)
    self.logger.addHandler(logging.FileHandler('train_log.txt'))

    if self.distributed_training is True:
        try:
            import horovod.tensorflow.keras as hvd

            # initialize horovod
            hvd.init()
            self.callbacks.append(
                hvd.callbacks.BroadcastGlobalVariablesCallback(0))
            self.callbacks.append(hvd.callbacks.MetricAverageCallback())
            # self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)]
            if hvd.rank() == 0:
                self.logger.info('Parallel training enabled.')
                self.logger.info(
                    'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                    % (self.batch_size, self.batch_size * hvd.size(),
                       hvd.size()))

            # Map an MPI process to a GPU (important!)
            print('hvd_rank = %d, hvd_local_rank = %d' %
                  (hvd.rank(), hvd.local_rank()))
            self.logger.info('hvd_rank = %d, hvd_local_rank = %d' %
                             (hvd.rank(), hvd.local_rank()))

            # Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
            os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

            # Horovod: pin the GPU to be used to the local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            # if gpus:
            #     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
        except ImportError as identifier:
            print('Error importing horovod. Disabling distributed training.')
            self.distributed_training = False
    else:
        self.logger.info('Parallel training disabled.')
        self.logger.info('Batch_size = %d' % (self.batch_size))
def create_callbacks(model, training_model, prediction_model,
                     validation_generator, args, verbose):
    # Create Horovod callbacks
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                 verbose=verbose)
    ]

    if hvd.rank() == 0 and args.output_path:
        # only one worker saves the checkpoint file
        # Create a snapshot for the epoch
        callbacks.append(
            keras.callbacks.ModelCheckpoint(
                os.path.join(args.output_path, 'model.h5')))

    tensorboard_callback = None
    if args.tensorboard_dir and hvd.rank() == 0:
        tensorboard_callback = keras.callbacks.TensorBoard(
            log_dir=args.tensorboard_dir,
            histogram_freq=0,
            batch_size=args.batch_size,
            write_graph=True,
            write_grads=False,
            write_images=False,
            embeddings_freq=0,
            embeddings_layer_names=None,
            embeddings_metadata=None)
        callbacks.append(tensorboard_callback)

    # if args.evaluation and validation_generator:
    #     if args.dataset_type == 'coco':
    #         from ..callbacks.coco import CocoEval
    #         # use prediction model for evaluation
    #         evaluation = CocoEval(validation_generator, tensorboard=tensorboard_callback)
    #     else:
    #         evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback)
    #     evaluation = RedirectModel(evaluation, prediction_model)
    #     callbacks.append(evaluation)

    callbacks.append(keras.callbacks.ReduceLROnPlateau(
        monitor='loss',
        factor=0.1,
        patience=2,
        verbose=1,
        mode='auto',
        epsilon=0.0001,
        cooldown=0,
        min_lr=0))

    return callbacks
def getModel(net_settings, num_classes=1):
    '''
    Should be modified with model type as input and returns the desired model
    '''
    if net_settings['model_type'] == 'resnet':
        base_model = resnet50.ResNet50(include_top=True, weights='imagenet')
        finetuning = Dense(1, activation='sigmoid',
                           name='predictions')(base_model.layers[-2].output)
        model = Model(inputs=base_model.input, outputs=finetuning)

        ## Adjust learning rate based on number of GPUs
        hv_lr = net_settings['lr'] * hvd.size()
        opt = optimizers.SGD(lr=hv_lr, momentum=0.9, decay=1e-6, nesterov=True)
        ## Adding Horovod DistributedOptimizer
        opt = hvd.DistributedOptimizer(opt)

        model.compile(loss=net_settings['loss'], optimizer=opt,
                      metrics=['accuracy'])

        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        if hvd.rank() == 0:
            callbacks.append(
                keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        return model
    elif net_settings['model_type'] == 'resnet101':
        model = resnet101_model(224, 224, 3, 1)

        ## Adjust learning rate based on number of GPUs
        hv_lr = net_settings['lr'] * hvd.size()
        opt = optimizers.SGD(lr=hv_lr, momentum=0.9, decay=1e-6, nesterov=True)
        ## Adding Horovod DistributedOptimizer
        opt = hvd.DistributedOptimizer(opt)

        model.compile(loss=net_settings['loss'], optimizer=opt,
                      metrics=['accuracy'])

        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        if hvd.rank() == 0:
            callbacks.append(
                keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        return model
    else:
        print('[models] Ugggh. Not ready for this yet.')
        exit(0)
        return None
def generate_train_patch_using_sharing(self, batch_size):
    comm = MPI.COMM_WORLD
    if hvd.rank() == 0:
        batch_patch_info = self.cnes_gen.choose_patches_for_iteration(
            batch_size * hvd.size())
        # batch_img, batch_gt = cnes_gen.generate_train_patch_fast(BATCH_SIZE, batch_patch_info)
        transfers = self.get_batch_sharing_solution(batch_patch_info)
        for k in range(1, hvd.size()):
            comm.send(batch_patch_info, dest=k, tag=1001)
            comm.send(transfers, dest=k, tag=1002)
    else:
        batch_patch_info = comm.recv(source=0, tag=1001)
        transfers = comm.recv(source=0, tag=1002)

    # batch_patch_info = np.zeros((6, batch_size * hvd.size()), np.int32)
    # transfers = np.zeros((hvd.size(), hvd.size()), np.int32)
    # batch_patch_info = hvd.broadcast(batch_patch_info, root_rank=0, name="BATCH_PATCH_INFO")
    # transfers = hvd.broadcast(transfers, root_rank=0, name="TRANSFERS")
    # batch_patch_info = comm.bcast(batch_patch_info, root=0)
    # transfers = comm.bcast(transfers, root=0)

    return self.get_batch_using_sharing(batch_size, batch_patch_info, transfers)
def load_data(self, data_fn, test_size=0.3, random=True):
    if not self.distributed_training:
        self.logger.info(
            'Loading the full dataset since distributed training is disabled ...')
        # X, Y = self.data_io.load_all(data_fn, dset_name_pattern=dset_name_pattern, camera_pos=camera_pos)
        X, Y = self.data_io.load_all(data_fn)
    else:
        self.logger.info(
            'Loading part of the dataset since distributed training is enabled ...')
        X, Y = self.data_io.load_partial(data_fn, hvd.size(), hvd.rank())

    self.logger.debug('Shape of X: %s' % str(X.shape))
    self.logger.debug('Shape of Y: %s' % str(Y.shape))

    # update the input_shape setting according to the loaded data
    self.input_shape = X.shape[1:]

    if test_size > 0:
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=42)
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        print("shapes:", self.x_train.shape, self.x_test.shape,
              self.y_train.shape, self.y_test.shape)
    else:
        # no test split: only the training attributes exist
        self.x_train = X
        self.y_train = Y
        print("shapes:", self.x_train.shape, self.y_train.shape)

    self.num_classes = np.unique(Y).shape[0]
    self.logger.debug('Number of classes: %d' % self.num_classes)
def _get_rank():
    if _DISTRIBUTED:
        try:
            return hvd.rank()
        except Exception:  # hvd.rank() raises if Horovod was not initialized
            return 0
    else:
        return 0
def _is_master(is_distributed=_DISTRIBUTED):
    if is_distributed:
        return hvd.rank() == 0
    else:
        return True
def _get_model_dir(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: save checkpoints only on worker 0 to prevent other workers from
        # corrupting them.
        return (os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
                if hvd.rank() == 0
                else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR"))
    else:
        return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
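# A minimal sketch (an assumption, not from the source) of how the three
# helpers above typically combine: only the master rank installs a real
# ModelCheckpoint, writing into the directory chosen by _get_model_dir(),
# so non-master workers never touch the shared checkpoint. The helper
# name _example_checkpoint_callbacks and the keras import are hypothetical.
import os
import keras


def _example_checkpoint_callbacks():
    callbacks = []
    if _is_master():
        model_dir = _get_model_dir()
        callbacks.append(
            keras.callbacks.ModelCheckpoint(
                os.path.join(model_dir, "checkpoint-{epoch}.h5")))
    return callbacks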
def readData(file):
    if not useHorovod or hvd.rank() == 0:
        logger.info("reading in training %s", file)
    loaded_data = zarr.open(file, mode='r')
    x_train = loaded_data['train']
    y_train = loaded_data['test']
    return x_train, y_train
def test_horovod(self):
    import horovod.keras as hvd
    self.assertEqual(hvd.init(), 1)
    self.assertEqual(hvd.rank(), 0)
    print('\nNOTE: remember to also test horovod with a real script, for example')
    print('https://github.com/CSCfi/machine-learning-scripts/blob/master/examples/keras-dvc-cnn-simple-hvd.py')
def train_evaluate():
    # Generate training and validation data generators
    def get_image_list(data_dir):
        dataset = []
        for folder in os.listdir(data_dir):
            for image in os.listdir(os.path.join(data_dir, folder)):
                dataset.append((os.path.join(data_dir, folder, image), folder))
        return dataset

    training_data = ImageSequence(
        get_image_list(os.path.join(FLAGS.data_dir, 'train')),
        FLAGS.batch_size, True)
    validation_data = ImageSequence(
        get_image_list(os.path.join(FLAGS.data_dir, 'test')),
        FLAGS.batch_size, False)

    # Horovod: initialize Horovod
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    # Create a model
    model = network_model(FLAGS.hidden_units)
    loss = 'categorical_crossentropy'

    # Horovod: adjust learning rate based on number of GPUs
    optimizer = Adadelta(lr=1.0 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer)

    metrics = ['acc']
    model.compile(optimizer, loss, metrics)

    # Set up callbacks
    callbacks = [
        # Broadcast initial variable states from rank 0 to all other processes
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save logs only on worker 0
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir))

    # Start training
    model.fit_generator(generator=training_data,
                        validation_data=validation_data,
                        epochs=FLAGS.epochs,
                        use_multiprocessing=True,
                        workers=4,
                        callbacks=callbacks,
                        verbose=1)

    # Save the model
    model.save(FLAGS.save_model_path)
def _main():
    hvd.init()
    better_exceptions.MAX_LENGTH = 128

    _MODELS_DIR.mkdir(parents=True, exist_ok=True)

    logger = tk.log.get()
    logger.addHandler(tk.log.stream_handler())
    if hvd.rank() == 0:
        logger.addHandler(
            tk.log.file_handler(_MODELS_DIR / 'train.log', append=True))

    with tk.dl.session(
            gpu_options={'visible_device_list': str(hvd.local_rank())}):
        _run()
def __init__(self, filename, batch_size):
    self.f_array = h5py.File(filename, "r")
    x = self.f_array["images"]
    y = self.f_array["masks"]
    self.batch_size = batch_size
    node_array_size = int(np.ceil(len(x) / hvd.size()))
    self.init_array = hvd.rank() * node_array_size
    self.end_array = self.init_array + node_array_size
    self.x = x
    self.y = y
    print("calculating size")
    print("size", len(self))
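# The constructor above prints len(self), which implies a __len__ on this
# keras.utils.Sequence subclass. A hedged sketch of the two required
# Sequence methods, consistent with the per-rank sharding in __init__
# (an assumption, not the source implementation):
def __len__(self):
    # number of batches in this rank's contiguous slice of the dataset
    return int(np.ceil((self.end_array - self.init_array) / self.batch_size))


def __getitem__(self, idx):
    # map a batch index into this rank's [init_array, end_array) window
    start = self.init_array + idx * self.batch_size
    end = min(start + self.batch_size, self.end_array)
    return self.x[start:end], self.y[start:end]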
def setup_hvd_callbacks(params, callbacks, encoder, decoder, prop_pred):
    import horovod.keras as hvd

    # Horovod: broadcast initial variable states
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    # Horovod: average metrics among workers at the end of every epoch.
    callbacks.append(hvd.callbacks.MetricAverageCallback())

    # Horovod: warm the learning rate up to its scaled value (base LR * hvd.size()).
    callbacks.append(
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                 verbose=(hvd.rank() == 0)))
def main(args):
    # =========== modified ============ #
    import horovod.keras as hvd
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    # =========== modified ============ #
    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum, hvd)

    callbacks = []
    # =========== modified ============ #
    # callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))

    # =========== modified ============ #
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))

    # =========== modified ============ #
    if hvd.rank() == 0:
        callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(TensorBoard(log_dir=args.model_dir, update_freq='epoch'))

    logging.info("Starting training")
    model.fit(x=train_dataset[0],
              y=train_dataset[1],
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size),
              epochs=args.epochs,
              validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size),
              callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # =========== modified ============ #
    # return save_model(model, args.model_dir)
    return save_model(model, args.model_output_dir)
def print_global_running_stats(self):
    stats = self.cnes_gen.get_running_stats()
    CLASS_ID_SET = self.cnes_gen.get_class_ids()
    print("stats at rank {} : {}".format(hvd.rank(), stats))

    stats_mat = np.zeros((len(CLASS_ID_SET) + 1, 2), np.float32)
    stats_mat[0, 1] = stats[0]
    idx = 1
    for cid in CLASS_ID_SET:
        stats_mat[idx, 0] = cid
        if cid in stats:
            stats_mat[idx, 1] = stats[cid]
        idx += 1

    print("Gathering stats from all MPI instances, rank {}".format(hvd.rank()))
    all_stats = hvd.allgather(stats_mat)  # comm.gather(stats, root=0)

    total_px = 0
    if hvd.rank() == 0:
        # print("Epoch {} class freqs:".format(self.epoch))
        class_stats = {class_id: 0 for class_id in CLASS_ID_SET}
        for class_id in CLASS_ID_SET:
            # print("Data for class {}: {}".format(class_id, all_stats[all_stats[:,0] == class_id, :]))
            px_class = np.sum(all_stats[all_stats[:, 0] == class_id, 1])
            class_stats[class_id] += px_class
            total_px += px_class

        non_annot_px = np.sum(all_stats[all_stats[:, 0] == 0, 1])
        total_px += non_annot_px
        print("Non annotated pixels : {}".format(non_annot_px))

        for class_id in class_stats:
            print("Class {} count = {}, freq {:.5f}%".format(
                class_id, class_stats[class_id],
                class_stats[class_id] / total_px * 100))
def save_model(self):
    if self.distributed_training is True:
        if hvd.rank() == 0:
            if self.use_noise is True:
                self.model.save('model_hvd_bw_%d_B0_with_noise_n_p_%d.h5'
                                % (self.input_shape[0], hvd.size()))
            else:
                self.model.save('model_hvd_bw_%d_B0_no_noise_%d_nodes.h5'
                                % (self.input_shape[0], hvd.size()))
    else:
        if self.use_noise is True:
            self.model.save('model_bw_%d_B0_with_noise.h5'
                            % (self.input_shape[0]))
        else:
            self.model.save('model_bw_%d_B0_no_noise.h5'
                            % (self.input_shape[0]))
def setup_generators(params):
    # train/valid splits
    train_split = '{}/index/{}/{}'.format(params.tag, 'LogD', SplitTypes.train)
    valid_split = '{}/index/{}/{}'.format(params.tag, 'LogD', SplitTypes.valid)

    # outputs
    if params.do_prop_pred:
        normalize_y = True
        regression_prediction_columns = params.prop_add
        output_datasets = ['{}/data/values/{}'.format(params.tag, x)
                           for x in regression_prediction_columns]
        if hvd.rank() == 0:
            for ix, val in enumerate(output_datasets):
                print('regression output:', val)
    else:
        normalize_y = []
        regression_prediction_columns = []
        output_datasets = []

    # inputs
    input_datasets = ['{}/data/one_hot/{}'.format(params.tag, x)
                      for x in ['smiles']]

    # setup generators
    train_gen = DatasetGeneratorFast(
        h5store=params.hdf5_file_path,
        vae_params={'hidden_dim': params.hidden_dim},
        batch_size=params.batch_size,
        xlabel=input_datasets,
        ylabel=output_datasets,
        normalize_X=False,
        normalize_y=normalize_y,
        splitlabel=train_split)

    valid_gen = DatasetGeneratorFast(
        h5store=params.hdf5_file_path,
        vae_params={'hidden_dim': params.hidden_dim},
        batch_size=params.batch_size,
        xlabel=input_datasets,
        ylabel=output_datasets,
        normalize_X=False,
        normalize_y=normalize_y,
        splitlabel=valid_split)

    return train_gen, valid_gen
def save_model(self):
    if self.distributed_training is True:
        if hvd.rank() == 0:
            if self.noise_stddev > 0:
                self.model.save('model_%d_%s_noise_np_%d.h5'
                                % (self.input_shape[0], self.base_model_name,
                                   hvd.size()))
            else:
                self.model.save('model_%d_%s_np_%d.h5'
                                % (self.input_shape[0], self.base_model_name,
                                   hvd.size()))
    else:
        if self.noise_stddev > 0:
            self.model.save('model_%d_%s_noise.h5'
                            % (self.input_shape[0], self.base_model_name))
        else:
            self.model.save('model_%d_%s.h5'
                            % (self.input_shape[0], self.base_model_name))
def data_generator(file_path, batch_size, seq_len=512, predict=False):
    # Trick the code into thinking we're only running 1 process for prediction when running `Metrics`.
    if predict:
        size = 1
    else:
        size = hvd.size()
    total_batch_size = batch_size * size
    print(total_batch_size)
    rank = hvd.rank()
    print(rank)
    range_start = batch_size * rank
    range_end = range_start + batch_size
    print(range_start, range_end)
    while True:
        with xopen(file_path, "rt") as f:
            _, label_dim = json.loads(f.readline())
            text = []
            labels = []
            for line in f:
                if len(text) == total_batch_size:
                    text = text[range_start:range_end]
                    labels = labels[range_start:range_end]
                    print(text[0])
                    # Fun fact: the 2 inputs must be in a list, *not* a tuple. Why.
                    yield ([np.asarray(text), np.zeros_like(text)],
                           np.asarray(labels))
                    text = []
                    labels = []
                line = json.loads(line)
                # First sublist is token ids.
                text.append(np.asarray(line[0])[0:seq_len])
                # Second sublist is positive label indices.
                label_line = np.zeros(label_dim, dtype='b')
                label_line[line[1]] = 1
                labels.append(label_line)
            # Yield what is left as the last batch when file has been read to its end.
            # Split the remaining examples, duplicating with `ceil()` if they don't split evenly.
            leftover_batch_start = ceil(len(text) / size) * rank
            leftover_batch_end = leftover_batch_start + ceil(len(text) / size)
            text = text[leftover_batch_start:leftover_batch_end]
            labels = labels[leftover_batch_start:leftover_batch_end]
            yield ([np.asarray(text), np.zeros_like(text)],
                   np.asarray(labels))
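# A hedged sketch of wiring the sharded generator above into training.
# `model`, `num_examples`, and the "train.jsonl" path are hypothetical;
# steps_per_epoch is divided by the global batch (batch_size * hvd.size())
# so each rank walks its shard roughly once per epoch.
train_gen = data_generator("train.jsonl", batch_size=16, seq_len=512)
steps = num_examples // (16 * hvd.size())
model.fit_generator(train_gen,
                    steps_per_epoch=steps,
                    epochs=3,
                    verbose=1 if hvd.rank() == 0 else 0)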
def main(use_horovod: bool, gpus: int, checkpoint: int, config_path: str) -> None:
    config = process_config(config_path, use_horovod, gpus, checkpoint)

    # create tensorflow session and set as keras backend
    tf_config = tf.ConfigProto()
    if config.trainer.use_horovod:
        import horovod.keras as hvd
        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd
        is_master = hvd.rank() == 0

    if is_master and not os.path.exists(config.exp.source_dir):
        # copy source files
        shutil.copytree(
            os.path.abspath(os.path.curdir),
            config.exp.source_dir,
            ignore=lambda src, names: {"datasets", "__pycache__", ".git",
                                       "experiments", "venv"})

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)

    data_loader = get_data_loader(config=config)
    model, trainer = build_model_and_trainer(config, data_loader)

    print(f"Start Training Experiment {config.exp.name}")
    try:
        trainer.train()
    except Exception as e:
        send_noti_to_telegram(f"an exception raised on training {config.exp.name}")
        raise e
def setup_tf_config(config: DotMap):
    tf_config = tf.ConfigProto()
    if config.trainer.use_horovod:
        import horovod.keras as hvd
        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd
        is_master = hvd.rank() == 0

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)
    return is_master
def batch_generator(full_sequences, fragment_length, batch_size,
                    fragment_stride, nb_output_bins, randomize_batch_order,
                    _rnd):
    indices = list(
        fragment_indices(full_sequences, fragment_length, batch_size,
                         fragment_stride, nb_output_bins))

    global g_multi_gpu
    if g_multi_gpu:
        import horovod.keras as hvd
        gpu_count = hvd.size()
        current_gpu = hvd.rank()
    else:
        gpu_count = 1
        current_gpu = 0

    if randomize_batch_order:
        _rnd.shuffle(indices)

    batches_parted = [batch for batch in partition_all(batch_size, indices)]
    start_index = len(batches_parted) // gpu_count * current_gpu
    batches_gpu = batches_parted[start_index:]
    batches = cycle(batches_gpu)

    for batch in batches:
        if len(batch) < batch_size:
            continue
        yield np.array([
            one_hot(full_sequences[e[0]][e[1]:e[1] + fragment_length])
            for e in batch
        ], dtype='uint8'), np.array([
            one_hot(full_sequences[e[0]][e[1] + 1:e[1] + fragment_length + 1])
            for e in batch
        ], dtype='uint8')
def GetCallbacks(logfileoutputdir, stage):
    logdir = logfileoutputdir + "/" + stage
    filename = logfileoutputdir + "/" + stage + "/modelunet.h5"
    logname = logfileoutputdir + "/" + stage + "/log.csv"
    if options.with_hvd:
        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1),
            keras.callbacks.TerminateOnNaN()
        ]
        if hvd.rank() == 0:
            callbacks += [
                keras.callbacks.ModelCheckpoint(filepath=filename,
                                                verbose=1,
                                                save_best_only=True),
                keras.callbacks.CSVLogger(logname),
                keras.callbacks.TensorBoard(log_dir=logdir,
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)
            ]
    else:
        callbacks = [
            keras.callbacks.TerminateOnNaN(),
            keras.callbacks.CSVLogger(logname),
            keras.callbacks.ModelCheckpoint(filepath=filename,
                                            verbose=1,
                                            save_best_only=True),
            keras.callbacks.TensorBoard(log_dir=logdir,
                                        histogram_freq=0,
                                        write_graph=True,
                                        write_images=False)
        ]
    return callbacks, filename
def init_callbacks(self) -> None:
    if self.config.trainer.use_lr_decay:
        # linear decay from the half of max_epochs
        def lr_scheduler(lr, epoch, max_epochs):
            return min(lr, 2 * lr * (1 - epoch / max_epochs))

        self.model_callbacks["combined"].append(
            LearningRateScheduler(schedule=lambda epoch: lr_scheduler(
                self.config.model.generator.lr, epoch,
                self.config.trainer.num_epochs)))
        for model_name in ['d_x', 'd_y']:
            self.model_callbacks[model_name].append(
                LearningRateScheduler(schedule=lambda epoch: lr_scheduler(
                    self.config.model.discriminator.lr, epoch,
                    self.config.trainer.num_epochs)))

    # if horovod is used, only worker 0 saves checkpoints
    is_master = True
    is_local_master = True
    if self.config.trainer.use_horovod:
        import horovod.keras as hvd
        is_master = hvd.rank() == 0
        is_local_master = hvd.local_rank() == 0

    # horovod callbacks
    if self.config.trainer.use_horovod:
        import horovod.keras as hvd
        self.model_callbacks["combined"].append(
            hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        self.model_callbacks["combined"].append(
            hvd.callbacks.MetricAverageCallback())
        self.model_callbacks["combined"].append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))

    if is_local_master:
        # model saver
        self.model_callbacks["serial_combined"].append(
            ModelCheckpointWithKeepFreq(
                filepath=os.path.join(self.config.exp.checkpoints_dir,
                                      "{epoch:04d}-combined.hdf5"),
                keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                save_checkpoint_freq=self.config.trainer.save_checkpoint_freq,
                save_best_only=False,
                save_weights_only=True,
                verbose=1))

        # save optimizer weights
        for model_name in ['combined', 'd_x', 'd_y']:
            self.model_callbacks[model_name].append(
                OptimizerSaver(self.config, model_name))

    if is_master:
        # save individual models
        for model_name in ['g_xy', 'g_yx', 'd_x', 'd_y']:
            self.model_callbacks[model_name].append(
                ModelSaver(
                    checkpoint_dir=self.config.exp.checkpoints_dir,
                    keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                    model_name=model_name,
                    num_epochs=self.config.trainer.num_epochs,
                    verbose=1))

        # send notification to telegram channel on train start and end
        self.model_callbacks["combined"].append(
            TrainProgressAlertCallback(
                experiment_name=self.config.exp.name,
                total_epochs=self.config.trainer.num_epochs))

        # tensorboard callback
        self.model_callbacks["combined"].append(
            ScalarCollageTensorBoard(
                log_dir=self.config.exp.tensorboard_dir,
                batch_size=self.config.trainer.batch_size,
                write_images=True))

    # initialize callbacks by setting model and params
    epochs = self.config.trainer.num_epochs
    steps_per_epoch = (self.data_loader.get_train_data_size()
                       // self.config.trainer.batch_size)
    for model_name in self.model_callbacks:
        model = eval(f"self.{model_name}")
        callbacks = self.model_callbacks[model_name]
        for callback in callbacks:
            callback.set_model(model)
            callback.set_params({
                "batch_size": self.config.trainer.batch_size,
                "epochs": epochs,
                "steps": steps_per_epoch,
                "samples": self.data_loader.get_train_data_size(),
                "verbose": True,
                "do_validation": False,
                "model_name": model_name,
            })
if args.keras_api:
    import keras as K
else:
    from tensorflow import keras as K

CHANNELS_LAST = True

hvd.init()

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Get rid of the AVX, SSE warnings
os.environ["OMP_NUM_THREADS"] = str(args.intraop_threads)
os.environ["KMP_BLOCKTIME"] = str(args.blocktime)
os.environ["KMP_AFFINITY"] = "granularity=thread,compact,1,0"

if hvd.rank() == 0:  # Only print on worker 0
    print_summary = args.print_model
    verbose = 1
    # os.system("lscpu")
    # os.system("uname -a")
    print("TensorFlow version: {}".format(tf.__version__))
    print("Intel MKL-DNN is enabled = {}".format(
        tf.pywrap_tensorflow.IsMklEnabled()))
    print("Keras API version: {}".format(K.__version__))
else:  # Don't print on workers > 0
    print_summary = 0
    verbose = 0

# Horovod needs to have every worker do the same amount of work.
# Otherwise it will complain at the end of the epoch when
# worker 0 takes more time than the others to do validation,
# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch, 0,
                                  name='resume_from_epoch')

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Input image dimensions
img_rows, img_cols = 28, 28
num_classes = 10

# Load Fashion MNIST data.
(x_train, y_train), (x_test, y_test) = load_data(args.dataset_path)

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
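# A hedged sketch of the usual continuation of this resume pattern,
# mirroring the stock Horovod Keras examples rather than necessarily this
# script: rank 0 restores the saved model (hvd.load_model rebuilds the
# wrapped DistributedOptimizer), while the other ranks construct a fresh
# model and receive the restored weights via the
# BroadcastGlobalVariablesCallback at the start of training.
# build_model() is a hypothetical model-construction helper.
if resume_from_epoch > 0 and hvd.rank() == 0:
    model = hvd.load_model(
        args.checkpoint_format.format(epoch=resume_from_epoch))
else:
    model = build_model()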
# Horovod: adjust learning rate based on number of GPUs.
opt = keras.optimizers.Adadelta(1.0 * hvd.size())

# Horovod: add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train, y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
def test_load_model_broadcast(self):
    hvd.init()

    def create_model():
        opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')
        return model

    with self.test_session() as sess:
        K.set_session(sess)

        model = create_model()

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        if hvd.rank() == 0:
            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

    K.clear_session()
    with self.test_session() as sess:
        K.set_session(sess)

        if hvd.rank() == 0:
            model = hvd.load_model(fname)
            os.remove(fname)
        else:
            model = create_model()

        def generator():
            while 1:
                yield (x, y)

        if hvd.rank() == 0:
            self.assertEqual(len(model.optimizer.weights), 5)
        else:
            self.assertEqual(len(model.optimizer.weights), 0)

        # No assertions, we just need to verify that it doesn't hang
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        model.fit_generator(generator(),
                            steps_per_epoch=10,
                            callbacks=callbacks,
                            epochs=0,
                            verbose=0,
                            workers=4,
                            initial_epoch=1)

        self.assertEqual(len(model.optimizer.weights), 5)
checkpoint_format = './checkpoint-{epoch}.h5'
log_dir = './logs'

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(epochs, 0, -1):
    if os.path.exists(checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch, 0,
                                  name='resume_from_epoch')

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Training data iterator.
train_gen = image.ImageDataGenerator(
    width_shift_range=0.33,
    height_shift_range=0.33,
    zoom_range=0.5,
    horizontal_flip=True,
    preprocessing_function=keras.applications.resnet50.preprocess_input)
train_iter = train_gen.flow_from_directory(train_dir,
                                           batch_size=batch_size,
                                           target_size=(224, 224))

# Validation data iterator.
test_gen = image.ImageDataGenerator(
    zoom_range=(0.875, 0.875),
    preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(test_dir,
                                         batch_size=batch_size,
                                         target_size=(224, 224))

# Set up standard ResNet-50 model.