def build_dataset(self):
    """Build the datasets used for training and evaluation.

    :return: train dataset, eval dataset, and (optional) distillation datasets
    """
    ds_train = get_dataset(self.config,
                           is_training=True,
                           num_shards=hvd.size(),
                           shard_index=hvd.rank())
    self.train_steps_per_epoch = ds_train.steps_per_epoch // hvd.size()
    train_dataset = ds_train.build()

    ds_eval = get_dataset(self.config, is_training=False)
    self.eval_steps_per_epoch = ds_eval.steps_per_epoch
    eval_dataset = ds_eval.build()

    train_dataset_distill = None
    eval_dataset_distill = None
    if self.config.get_attribute("scheduler") == "distill" \
            or self.config.get_attribute("is_distill"):
        ds_train_distill = get_dataset(self.config,
                                       is_training=True,
                                       num_shards=hvd.size(),
                                       shard_index=hvd.rank())
        train_dataset_distill = ds_train_distill.build(True)

        ds_eval_distill = get_dataset(self.config, is_training=False)
        eval_dataset_distill = ds_eval_distill.build(True)

    return train_dataset, eval_dataset, train_dataset_distill, eval_dataset_distill
def compute_expected_value(
    batch_id: int,
    aggregation_frequency: int,
    multiplier: float,
    average_aggregated_gradient: bool,
    tf2: bool,
) -> float:
    """Compute the expected value based on how we are aggregating gradients."""
    gradients_aggregated = (batch_id + 1) % aggregation_frequency == 0
    if gradients_aggregated:
        all_reduced_grads = 0.0
        for _ in range(aggregation_frequency):
            grads_for_batch = 0.0
            for rank in range(hvd.size()):
                grads_for_batch += rank * multiplier
            if average_aggregated_gradient:
                grads_for_batch /= float(aggregation_frequency)
            all_reduced_grads += grads_for_batch / float(hvd.size())
        return all_reduced_grads
    else:
        non_aggregated_grads = hvd.rank() * multiplier
        if tf2:
            # In TF2 we return the sum of the locally aggregated gradients.
            non_aggregated_grads *= (batch_id + 1) % aggregation_frequency
        return non_aggregated_grads
def on_train_end(self, logs=None):
    img_sec_mean = np.mean(self.img_secs)
    img_sec_conf = 1.96 * np.std(self.img_secs)
    print('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    print('Total img/sec on %d %s(s): %.1f +-%.1f' %
          (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
def validate(self,
             data_creator,
             verbose=1,
             sample_weight=None,
             steps=None,
             callbacks=None,
             data_config=None):
    """Evaluates the model on the validation data set."""
    config = self.config.copy()
    if data_config is not None:
        config.update(data_config)

    if self.backend == "horovod":
        import horovod.tensorflow.keras as hvd
        assert "batch_size" in config, "batch_size must be set in config"
        config["batch_size"] = config["batch_size"] // hvd.size()
        dataset = data_creator(config)
        from tensorflow.python.distribute.input_ops import auto_shard_dataset
        dataset = auto_shard_dataset(dataset, hvd.size(), hvd.rank())
    elif self.backend == "tf-distributed":
        with self.strategy.scope():
            dataset = data_creator(config)
    else:
        dataset = data_creator(config)

    if self.backend == "horovod":
        import horovod.tensorflow.keras as hvd
        if hvd.rank() != 0:
            verbose = 0
    elif self.backend == "tf-distributed":
        if self.strategy.cluster_resolver.task_id != 0:
            verbose = 0

    params = dict(
        verbose=verbose,
        sample_weight=sample_weight,
        steps=steps,
        callbacks=callbacks,
    )
    results = self.model.evaluate(dataset, **params)
    if results is None:
        # Use a local model since model.evaluate() returns None
        # for MultiWorkerMirroredStrategy.
        logger.warning("Running a local model to get validation score.")
        self.local_model = self.model_creator(self.config)
        self.local_model.set_weights(self.model.get_weights())
        results = self.local_model.evaluate(dataset, **params)

    if isinstance(results, list):
        stats = {
            "validation_" + k: v
            for k, v in zip(self.model.metrics_names, results)
        }
    else:
        stats = {"results": results}
    return stats
def input_fn(is_training,
             data_dir,
             batch_size,
             dtype,
             num_epochs=1,
             datasets_num_private_threads=None,
             num_parallel_batches=5):
    """Input function which provides batches for train or eval.

    Args:
      is_training: A boolean denoting whether the input is for training.
      data_dir: The directory containing the input data.
      batch_size: The number of samples per batch.
      dtype: Data type to use for images/features.
      num_epochs: The number of epochs to repeat the dataset.
      datasets_num_private_threads: Number of private threads for tf.data.
      num_parallel_batches: Number of parallel batches for tf.data.

    Returns:
      A dataset that can be used for iteration.
    """
    filenames = get_filenames(is_training, data_dir)
    labels = get_labels(is_training)
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))

    # Shard the dataset if it makes sense.
    if hvd.size() > 1:
        print('Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d' %
              (hvd.rank(), hvd.size()))
        dataset = dataset.shard(hvd.size(), hvd.rank())

    if is_training:
        # Shuffle the input files.
        dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER)

    # Convert to individual records.
    # cycle_length = 10 means 10 files will be read and deserialized in parallel.
    # This number is low enough to not cause too much contention on small systems
    # but high enough to provide the benefits of parallelization. You may want
    # to increase this number if you have a large number of CPU cores.
    # dataset = dataset.apply(tf.data.experimental.parallel_interleave(
    #     tf.data.TFRecordDataset, cycle_length=10))

    return process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=record_parser,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        num_parallel_batches=num_parallel_batches)
def step(self,
         data_creator,
         epochs=1,
         verbose=1,
         callbacks=None,
         validation_data_creator=None,
         class_weight=None,
         steps_per_epoch=None,
         validation_steps=None,
         validation_freq=1):
    """Runs a training epoch and updates the model parameters."""
    train_dataset = data_creator(self.config)
    if validation_data_creator is not None:
        test_dataset = validation_data_creator(self.config)
    else:
        test_dataset = None

    if self.backend == "horovod":
        import horovod.tensorflow.keras as hvd
        from tensorflow.python.distribute.input_ops import auto_shard_dataset
        train_dataset = auto_shard_dataset(train_dataset, hvd.size(), hvd.rank())
        if test_dataset is not None:
            test_dataset = auto_shard_dataset(test_dataset, hvd.size(), hvd.rank())

    if self.backend == "horovod":
        import horovod.tensorflow.keras as hvd
        hvd_callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback()
        ]
        if hvd.rank() != 0:
            verbose = 0
        if callbacks is not None:
            callbacks = hvd_callbacks + callbacks
        else:
            callbacks = hvd_callbacks

    history = self.model.fit(train_dataset,
                             epochs=self.epoch + epochs,
                             verbose=verbose,
                             callbacks=callbacks,
                             validation_data=test_dataset,
                             class_weight=class_weight,
                             initial_epoch=self.epoch,
                             steps_per_epoch=steps_per_epoch,
                             validation_steps=validation_steps,
                             validation_freq=validation_freq)
    if history is None:
        stats = {}
    else:
        stats = {"train_" + k: v[-1] for k, v in history.history.items()}
    self.epoch += epochs
    return stats
def is_initialized():
    """Checks if Horovod is initialized.

    :return: bool
    """
    try:
        hvd.size()
    except ValueError:
        return False
    return True
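# Hedged usage sketch (not part of the original snippets): one way is_initialized()
# could be used so code works both with and without a running Horovod job. The
# helper name `global_batch_size` is an assumption; `is_initialized` is defined above.
import horovod.tensorflow.keras as hvd


def global_batch_size(per_worker_batch_size):
    # Scale by the worker count only when Horovod has actually been initialized.
    workers = hvd.size() if is_initialized() else 1
    return per_worker_batch_size * workers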
def adapt_optimizer(opt):
    # Accept either an optimizer name or an optimizer instance.
    if opt.__class__.__name__ == 'str':
        opt = get_optimizer_by_name(opt)
    # Scale the learning rate by the number of workers, then wrap the optimizer
    # in Horovod's DistributedOptimizer.
    opt_config = opt.get_config()
    try:
        opt_config['learning_rate'] *= hvd.size()
    except KeyError:
        opt_config['lr'] *= hvd.size()
    return hvd.DistributedOptimizer(opt.from_config(opt_config))
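# Hedged usage sketch (assumption, not from the original source): passing a Keras
# optimizer through adapt_optimizer() before compiling a model, so the learning
# rate is scaled by hvd.size() and gradients are allreduced across workers.
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
opt = adapt_optimizer(tf.keras.optimizers.Adam(learning_rate=0.001))

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(loss='mse', optimizer=opt)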
def train_hvd(learning_rate=1.0):
    # TensorFlow has given up on pickling; we need to explicitly import its
    # modules inside the workers.
    from tensorflow.keras import backend as K
    from tensorflow.keras.models import Sequential
    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    (x_train, y_train), (x_test, y_test) = get_dataset(num_classes, hvd.rank(), hvd.size())
    model = get_model(num_classes)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = keras.optimizers.Adadelta(lr=learning_rate * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.ckpt',
                                            save_weights_only=True))

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))
def compute_expected_value(batch_id):
    sum_per_aggregation = 0.0
    for _ in range(backward_passes_per_step):
        grads_for_batch = 0.0
        for rank in range(hvd.size()):
            grads_for_batch += rank

        # Apply `average_aggregated_gradients`.
        grads_for_batch /= float(backward_passes_per_step)

        # Average across workers.
        sum_per_aggregation += grads_for_batch / float(hvd.size())

    aggregations_completed = math.floor((batch_id + 1) / backward_passes_per_step)
    return aggregations_completed * sum_per_aggregation
def _compile_graph(self, model, loss_func='mse', opt_func='adam'):
    loss_functions = {
        'mse': 'mean_squared_error',
        'msle': 'mean_squared_logarithmic_error',
        'cc': 'categorical_crossentropy',
        'bce': 'binary_crossentropy',
        # 'bce': BinaryCrossentropy()
        # 'scc': 'sparse_categorical_crossentropy'  # wants a single output
    }
    opt_functions = {'adam': Adam, 'sgd': SGD, 'rms': RMSprop}

    logger.debug(
        "Using the %s optimizer with a learning rate of %s and the %s loss function"
        % (opt_func, str(self.learning_rate), loss_func))

    if hvd:
        # Horovod: scale the learning rate by the number of workers and wrap
        # the optimizer in a distributed optimizer.
        opt = opt_functions[opt_func](lr=self.learning_rate * hvd.size())
        if hvd.rank() == 0:
            logger.debug("Compiling distributed optimizer")
        opt = hvd.DistributedOptimizer(opt)
        self.callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
    else:
        opt = opt_functions[opt_func](lr=self.learning_rate)

    # Compile the model.
    model.compile(loss=loss_functions[loss_func],
                  optimizer=opt,
                  metrics=['accuracy'])
    # model.summary()
    plot_model(model,
               to_file=os.path.join(self.save_dir, '%s.png' % (self.param_name)))
def create_config(args):
    assert not (args.cpu and args.amp), \
        "Automatic mixed precision conversion works only with GPU"
    assert (not args.benchmark
            or args.benchmark_warmup_steps < args.benchmark_steps), \
        "Number of benchmark steps must be higher than warmup steps"

    logger = logging.getLogger("tensorflow")

    if args.cpu:
        init_cpu(args, logger)
    else:
        init_gpu(args, logger)

    num_gpus = 1 if args.cpu else hvd.size()
    train_batch_size = args.global_batch_size // num_gpus
    eval_batch_size = args.eval_batch_size // num_gpus

    train_paths = sorted(glob.glob(args.train_data_pattern))
    valid_paths = sorted(glob.glob(args.eval_data_pattern))

    train_spec_input_fn = train_input_fn(
        train_paths=train_paths,
        records_batch_size=train_batch_size,
    )
    eval_spec_input_fn = eval_input_fn(valid_paths=valid_paths,
                                       records_batch_size=eval_batch_size)

    config = {
        "train_dataset": train_spec_input_fn,
        "eval_dataset": eval_spec_input_fn,
    }
    return config
def define_model():
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(10, activation='softmax'))

    # Horovod: adjust the learning rate based on the number of workers.
    scaled_lr = 0.001 * hvd.size()
    opt = tf.optimizers.Adam(scaled_lr)

    # Horovod: wrap the optimizer for allreduce-based gradient aggregation.
    opt = hvd.DistributedOptimizer(opt,
                                   backward_passes_per_step=1,
                                   average_aggregated_gradients=True)

    model.compile(
        optimizer=opt,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
        experimental_run_tf_function=False)
    return model
def get_model(input_shape, learning_rate, weight_decay, optimizer, momentum, hvd):
    input_tensor = Input(shape=input_shape)
    base_model = keras.applications.resnet50.ResNet50(include_top=False,
                                                      weights=None,
                                                      input_tensor=input_tensor,
                                                      input_shape=input_shape,
                                                      classes=None)
    x = Flatten()(base_model.output)
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=predictions)

    size = hvd.size()
    if optimizer.lower() == 'sgd':
        opt = SGD(lr=learning_rate * size, decay=weight_decay, momentum=momentum)
    elif optimizer.lower() == 'rmsprop':
        opt = RMSprop(lr=learning_rate * size, decay=weight_decay)
    else:
        opt = Adam(lr=learning_rate * size, decay=weight_decay)

    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model
def get_gpu_num():
    """Returns the number of available GPUs (Horovod workers).

    :return: num of gpus
    """
    if hvd is None or not is_initialized():
        return 1
    return hvd.size()
def init_gpu(args, logger):
    hvd.init()

    init_logger(full=hvd.rank() == 0, args=args, logger=logger)
    if args.affinity != 'disabled':
        gpu_id = hvd.local_rank()
        affinity = set_affinity(gpu_id=gpu_id,
                                nproc_per_node=hvd.size(),
                                mode=args.affinity)
        logger.warning(f'{gpu_id}: thread affinity: {affinity}')

    gpus = tf.config.experimental.list_physical_devices('GPU')
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)

    if args.xla:
        tf.config.optimizer.set_jit(True)
def init_hvd(args):
    if hvd:
        hvd.init()
        FORMAT = ("[%%(levelname)s - P%i/%i - %%(filename)s:%%(lineno)s - "
                  "%%(funcName)s] %%(message)s" % (hvd.rank(), hvd.size()))

        # Remove all handlers associated with the root logger object.
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        if args.verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        logger.debug("Updated logger to print process")

    args.hvd_rank = hvd.rank() if hvd else 0
    args.hvd_size = hvd.size() if hvd else 1
def train(model, dataset, epoch, initial_lr):
    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    # Note: `opt` is expected to be defined outside this function as a Horovod
    # DistributedOptimizer before train() is called.
    model.compile(
        loss=tf.losses.SparseCategoricalCrossentropy(),
        optimizer=opt,
        metrics=["accuracy"],
        experimental_run_tf_function=False,
    )

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(initial_lr,
                                                 warmup_epochs=3,
                                                 verbose=1),
    ]
    if hvd.rank() == 0:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint("checkpoint-{epoch}.h5"))

    verbose = 1 if hvd.rank() == 0 else 0
    model.fit(
        dataset,
        steps_per_epoch=500 // hvd.size(),
        callbacks=callbacks,
        epochs=epoch,
        verbose=verbose,
    )
def train(state):
    # Horovod: adjust number of steps based on number of GPUs.
    state.model.fit(dataset,
                    steps_per_epoch=500 // hvd.size(),
                    callbacks=callbacks,
                    epochs=epochs - state.epoch,
                    verbose=1 if hvd.rank() == 0 else 0)
def create_resnet():
    # Build the network.
    import keras_resnet_single as networks
    resnet = networks.ResNet.build(
        len(channels), resblocks, [16, 32],
        (125 * granularity, 125 * granularity, len(channels)), granularity)

    # Load saved weights, if indicated.
    if args.load_epoch != 0:
        directory = args.save_dir
        if args.save_dir == '':
            directory = expt_name
        model_name = glob.glob('../MODELS/%s/epoch%02d-*.hdf5' %
                               (directory, args.load_epoch))[0]
        # assert len(model_name) == 2
        # model_name = model_name[0].split('.hdf5')[0] + '.hdf5'
        print('Loading weights from file:', model_name)
        resnet.load_weights(model_name)

    # opt = keras.optimizers.Adam(lr=lr_init, epsilon=1.e-5)  # changed eps to match pytorch value
    # opt = keras.optimizers.SGD(lr=lr_init * hvd.size())
    opt = NovoGrad(learning_rate=lr_init * hvd.size())

    # Wrap the optimizer in a Horovod distributed optimizer, so
    # hvd.DistributedOptimizer() is used to compute gradients.
    opt = hvd.DistributedOptimizer(opt)

    # For Horovod: specify `experimental_run_tf_function=False` so TensorFlow
    # uses the distributed optimizer to compute gradients.
    # resnet.compile(loss='binary_crossentropy', optimizer=opt,
    #                metrics=['accuracy'], experimental_run_tf_function=False)
    # resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    resnet.summary()

    return resnet
def test_train_model_lr_schedule(self):
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        0.001 * hvd.size(),
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True)
    opt = tf.keras.optimizers.Adam(lr_schedule)
    opt = hvd.DistributedOptimizer(opt)

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(2, input_shape=(3,)))
    model.add(keras.layers.RepeatVector(3))
    model.add(keras.layers.ThresholdedReLU(0.5))
    model.compile(loss=keras.losses.mean_squared_error,
                  optimizer=opt,
                  metrics=[keras.metrics.categorical_accuracy],
                  experimental_run_tf_function=False)

    x = np.random.random((1, 3))
    y = np.random.random((1, 3, 2))

    # No assertions, we just need to verify that it doesn't hang or error.
    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
    model.fit(x, y, steps_per_epoch=10, callbacks=callbacks, epochs=1)
def init_workers(distributed=False):
    """Initialize distributed workers."""
    rank, local_rank, n_ranks = 0, 0, 1
    if distributed:
        hvd.init()
        rank, local_rank, n_ranks = hvd.rank(), hvd.local_rank(), hvd.size()
    return rank, local_rank, n_ranks
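# Hedged usage sketch (assumption, not from the original source): consuming the
# (rank, local_rank, n_ranks) tuple from init_workers() to scale the learning rate
# and restrict logging to rank 0. The base learning rate is illustrative only.
rank, local_rank, n_ranks = init_workers(distributed=True)
base_lr = 0.001
scaled_lr = base_lr * n_ranks
if rank == 0:
    print('Training on %d ranks (local rank %d), lr=%g' %
          (n_ranks, local_rank, scaled_lr))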
def train(model, dataset, epochs, steps_per_epoch, hvd_rank=0, hvd_size=1):
    scaled_lr = 0.001 * hvd.size()
    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to
        # worse final accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 *
        # hvd.size()` during the warmup epochs. See https://arxiv.org/abs/1706.02677
        # for details.
        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr,
                                                 warmup_epochs=1,
                                                 verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd_rank == 0 else 0

    model.fit(dataset,
              epochs=epochs,
              steps_per_epoch=steps_per_epoch // hvd_size,
              callbacks=callbacks,
              verbose=verbose,
              validation_data=get_test_dataset())
    return model
def create_model():
    model = models.Sequential()
    model.add(
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Horovod: adjust learning rate based on number of GPUs.
    opt = optimizers.SGD(0.01 * hvd.size())

    # Horovod: add Horovod DistributedOptimizer.
    opt = hvd.DistributedOptimizer(opt)

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  experimental_run_tf_function=False)
    return model
def __init__(self,
             data_dir,
             index_file_dir,
             split='train',
             num_classes=None,
             image_size=224,
             num_channels=3,
             batch_size=128,
             dtype='float32',
             one_hot=False,
             use_dali=False,
             augmenter=None,
             shuffle_buffer_size=10000,
             file_shuffle_buffer_size=1024,
             cache=False,
             mean_subtract=False,
             standardize=False,
             augmenter_params=None,
             cutmix_alpha=0.0,
             mixup_alpha=0.0,
             defer_img_mixing=True,
             hvd_size=None,
             disable_map_parallelization=False):
    """Initialize the builder from the config."""
    if not os.path.exists(data_dir):
        raise FileNotFoundError('Cannot find data dir: {}'.format(data_dir))
    if one_hot and num_classes is None:
        raise FileNotFoundError('Number of classes is required for one_hot')

    self._data_dir = data_dir
    self._split = split
    self._image_size = image_size
    self._num_classes = num_classes
    self._num_channels = num_channels
    self._batch_size = batch_size
    self._dtype = dtype
    self._one_hot = one_hot
    self._augmenter_name = augmenter
    self._shuffle_buffer_size = shuffle_buffer_size
    self._file_shuffle_buffer_size = file_shuffle_buffer_size
    self._cache = cache
    self._mean_subtract = mean_subtract
    self._standardize = standardize
    self._index_file = index_file_dir
    self._use_dali = use_dali
    self.mixup_alpha = mixup_alpha
    self.cutmix_alpha = cutmix_alpha
    self.defer_img_mixing = defer_img_mixing
    self.disable_map_parallelization = disable_map_parallelization
    self._num_gpus = hvd.size() if not hvd_size else hvd_size

    if self._augmenter_name is not None:
        augmenter = AUGMENTERS.get(self._augmenter_name, None)
        params = augmenter_params or {}
        self._augmenter = augmenter(**params) if augmenter is not None else None
    else:
        self._augmenter = None
def test_elastic_state(self):
    v = 1.0 if hvd.rank() == 0 else 2.0
    model1 = tf.keras.Sequential(
        [tf.keras.layers.Dense(2, activation='softmax')])
    model1.build((2, 2))
    model1.set_weights([
        np.array([[v, v], [v, v]], dtype=np.float32),
        np.array([v, v], dtype=np.float32)
    ])

    model2 = tf.keras.Sequential(
        [tf.keras.layers.Dense(2, activation='softmax')])
    model2.build((2, 2))
    model2.set_weights([
        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
        np.array([0.0, 0.0], dtype=np.float32)
    ])

    optimizer = tf.optimizers.Adam(0.001 * hvd.size())

    state = hvd.elastic.KerasState(model1,
                                   optimizer,
                                   batch=20 + hvd.rank(),
                                   epoch=10 + hvd.rank())
    state.sync()

    model1_weights = model1.get_weights()
    model2_weights = model2.get_weights()

    # After sync, all values should match the root rank.
    for w in state.model.get_weights():
        self.assertAllClose(w, np.ones_like(w))
    assert state.batch == 20
    assert state.epoch == 10

    # Partially modify, then restore.
    model1.set_weights(model2_weights)
    state.batch = 21
    state.epoch = 11

    state.restore()

    for w1, w2 in zip(model1.get_weights(), model1_weights):
        self.assertAllClose(w1, w2)
    assert state.batch == 20
    assert state.epoch == 10

    # Partially modify, then commit.
    model1.set_weights(model2_weights)
    state.batch = 21
    state.epoch = 11

    state.commit()
    state.restore()

    for w1, w2 in zip(model1.get_weights(), model2_weights):
        self.assertAllClose(w1, w2)
    assert state.batch == 21
    assert state.epoch == 11
def build(self) -> tf.data.Dataset:
    """Construct a dataset end-to-end and return it.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    if self._use_dali:
        print("Using dali for {train} dataloading".format(
            train="training" if self.is_training else "validation"))
        tfrec_filenames = sorted(
            tf.io.gfile.glob(
                os.path.join(self._data_dir, '%s-*' % self._split)))
        tfrec_idx_filenames = sorted(
            tf.io.gfile.glob(
                os.path.join(self._index_file, '%s-*' % self._split)))

        # Create the DALI pipeline.
        dali_pipeline = Dali.DaliPipeline(
            tfrec_filenames=tfrec_filenames,
            tfrec_idx_filenames=tfrec_idx_filenames,
            height=self._image_size,
            width=self._image_size,
            batch_size=self.local_batch_size,
            num_threads=1,
            device_id=hvd.local_rank(),
            shard_id=hvd.rank(),
            num_gpus=hvd.size(),
            num_classes=self.num_classes,
            deterministic=False,
            dali_cpu=False,
            training=self.is_training)

        # Define shapes and types of the outputs.
        shapes = ((self.local_batch_size, self._image_size, self._image_size, 3),
                  (self.local_batch_size, self._num_classes))
        dtypes = (tf.float32, tf.float32)

        # Create the dataset.
        dataset = dali_tf.DALIDataset(pipeline=dali_pipeline,
                                      batch_size=self.local_batch_size,
                                      output_shapes=shapes,
                                      output_dtypes=dtypes,
                                      device_id=hvd.local_rank())
        # if self.is_training and self._augmenter:
        #     print('Augmenting with {}'.format(self._augmenter))
        #     dataset.unbatch().map(
        #         self.augment_pipeline,
        #         num_parallel_calls=tf.data.experimental.AUTOTUNE
        #     ).batch(self.local_batch_size)
        return dataset
    else:
        print("Using tf native pipeline for {train} dataloading".format(
            train="training" if self.is_training else "validation"))
        dataset = self.load_records()
        dataset = self.pipeline(dataset)
        return dataset
def __init__(self, config: Seq2SeqConfig):
    """Initialize the model for training.

    :param config: seq2seq config from input data
    """
    self.body_count = config.body_count
    self.max_body_length = config.max_body_length
    self.subject_count = config.subject_count
    self.max_subject_length = config.max_subject_length
    self.body_word_to_index = config.body_word_to_index
    self.body_index_to_word = config.body_index_to_word
    self.subject_word_to_index = config.subject_word_to_index
    self.subject_index_to_word = config.subject_index_to_word
    self.config = config.__dict__

    encoder_inputs: Input = Input(shape=(None,), name="encoder_inputs")
    encoder_embedding: Embedding = Embedding(
        input_dim=self.body_count,
        output_dim=self.hidden_units,
        input_length=self.max_body_length,
        name="encoder_embedding",
    )
    encoder_lstm: LSTM = LSTM(units=self.hidden_units,
                              return_state=True,
                              name="encoder_lstm")
    _, encoder_hidden_state, encoder_cell_state = encoder_lstm(
        encoder_embedding(encoder_inputs))
    encoder_states: List[np.ndarray] = [encoder_hidden_state, encoder_cell_state]

    decoder_inputs: Input = Input(shape=(None, self.subject_count),
                                  name="decoder_inputs")
    decoder_lstm: LSTM = LSTM(units=self.hidden_units,
                              return_state=True,
                              return_sequences=True,
                              name="decoder_lstm")
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    decoder_dense = Dense(units=self.subject_count,
                          activation="softmax",
                          name="decoder_dense")
    decoder_outputs = decoder_dense(decoder_outputs)

    # Horovod: add Horovod Distributed Optimizer.
    try:
        optimizer = RMSprop(1.0 * hvd.size())
        optimizer = hvd.DistributedOptimizer(optimizer)
    except ValueError:
        print("Running outside Horovod.")
        optimizer = RMSprop(1.0)

    model: Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=optimizer,
        metrics=["accuracy"],
        experimental_run_tf_function=False,
    )
    self.model = model

    self.encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_inputs: List[Input] = [
        Input(shape=(self.hidden_units,)),
        Input(shape=(self.hidden_units,)),
    ]
    decoder_outputs, hidden_state, cell_state = decoder_lstm(
        decoder_inputs, initial_state=decoder_state_inputs)
    decoder_states: List[Dense] = [hidden_state, cell_state]
    decoder_outputs = decoder_dense(decoder_outputs)
    self.decoder_model = Model([decoder_inputs] + decoder_state_inputs,
                               [decoder_outputs] + decoder_states)
def train(state):
    # Horovod: adjust number of steps based on number of GPUs and number of epochs
    # based on the number of previously completed epochs.
    state.model.fit(dataset,
                    steps_per_epoch=args.batches_per_epoch // hvd.size(),
                    callbacks=callbacks,
                    epochs=epochs - state.epoch,
                    verbose=1 if hvd.rank() == 0 else 0)
def init_workers(distributed=False):
    if distributed:
        hvd.init()
        return SimpleNamespace(rank=hvd.rank(),
                               size=hvd.size(),
                               local_rank=hvd.local_rank(),
                               local_size=hvd.local_size())
    else:
        return SimpleNamespace(rank=0, size=1, local_rank=0, local_size=1)
args, _ = parser.parse_known_args()

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process).
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
x_train = np.load(os.path.join(args.train, 'train.npz'))['data']
y_train = np.load(os.path.join(args.train, 'train.npz'))['labels']
print("Train dataset loaded from: {}".format(os.path.join(args.train, 'train.npz')))
x_test = np.load(os.path.join(args.test, 'test.npz'))['data']
y_test = np.load(os.path.join(args.test, 'test.npz'))['labels']
print("Test dataset loaded from: {}".format(os.path.join(args.test, 'test.npz')))