def load_model(filepath, compile=True, **kwargs):
  """Load a saved Keras model from `filepath` and wrap it as a `tnt.Model`.

  Args:
    filepath: path to a model saved with `model.save` (SavedModel or HDF5).
    compile: if True, also re-wrap the saved optimizer into a distributed
             optimizer so the model is ready for training.
    **kwargs: forwarded to `tf.keras.models.load_model`.

  Returns:
    A `tnt.Model` using the data-parallel strategy.
  """
  logger.debug("Load model from file: {}".format(filepath))
  keras_model = tf.keras.models.load_model(filepath, compile=compile, **kwargs)
  # FIXME load models with any type of parallelization strategy
  logger.warning("Loading model with the default `data parallel` strategy.")
  tnt_model = tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
  if compile:
    try:
      # Re-wrap the deserialized optimizer so gradients are synchronized
      # across the ranks of the model's group.
      tnt_optimizer = tnt.distributed_optimizers.SynchDistributedOptimizer(
          keras_model.optimizer, group=tnt_model.group)
      tnt_model.dist_optimizer = tnt_optimizer
      tnt_model._set_internal_optimizer(tnt_model.dist_optimizer)
      tnt_model.compiled = True
      # Weights come from the checkpoint; no initial broadcast is needed.
      tnt_model.done_broadcast = True
      if version_utils.tf_version_below_equal('2.1'):
        tnt_model.model._experimental_run_tf_function = False
        logger.info("Set `experimental_run_tf_function` to False.")
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
    # not silently swallowed; a model saved without compiling (optimizer is
    # None) still lands here and is reported as not pre-compiled.
    except Exception:
      logger.info("The loaded model was not pre-compiled.")
  # Keep all ranks in sync before returning the model.
  tnt_model.barrier.execute()
  return tnt_model
def test_add_metric(self):
  # Before any custom metric is registered, only the loss is tracked.
  model = tnt.Model(mnist.lenet5_model_generator())
  assert model.metrics == ['loss']
  # Registering a metric on the output tensor must make it show up alongside
  # the loss, both as a metric object and by name.
  model.add_metric(model.output, aggregation='mean', name='metric_name')
  assert len(model.metrics) == 2
  assert model.metrics_names == ['loss', 'metric_name']
def test_add_metric(self):
  # In this TF version, the metrics list starts out empty (no implicit loss).
  model = tnt.Model(mnist.lenet5_model_generator())
  assert model.metrics == []
  # `aggregation` is deprecated after TF 2.2 but still accepted here.
  model.add_metric(model.output, aggregation='mean', name='metric_name')
  assert len(model.metrics) == 1
  assert model.metrics_names == ['metric_name']
def test_save_before_compile(self, model, save_setup, parallel_strategy,
                             check_configuration_identical):
  # Saving must work even for a model that was never compiled.
  uncompiled = tnt.Model(model, parallel_strategy)
  uncompiled.save(save_setup['save_dir'],
                  tnt_save_all_devices=save_setup['all_devices'])
  restored = tnt.models.load_model(save_setup['save_dir'])
  # The reloaded object behaves like a Keras model and preserves the
  # original configuration.
  assert isinstance(restored, keras.Model)
  check_configuration_identical(restored, uncompiled)
def clone_model(model, **kwargs):
  """Clone a `tnt.Model` or `tf.keras.Model` into a new `tnt.Model`.

  Args:
    model: the model to clone; either a `tnt.Model` (its wrapped Keras model
           is cloned) or a plain `tf.keras.Model`.
    **kwargs: forwarded to `tf.keras.models.clone_model`.

  Returns:
    A fresh `tnt.Model` wrapping the cloned Keras model.

  Raises:
    ValueError: if `model` is neither a `tnt.Model` nor a `tf.keras.Model`.
  """
  if isinstance(model, tnt.Model):
    keras_model = tf.keras.models.clone_model(model.model, **kwargs)
    logger.info("clone model from instance of tnt.Model")
  elif isinstance(model, tf.keras.Model):
    keras_model = tf.keras.models.clone_model(model, **kwargs)
    logger.info("clone model from instance of tf.keras.Model")
  else:
    # Fixed: the original passed two separate arguments to ValueError, which
    # rendered the message as a tuple; concatenate into a single string.
    raise ValueError("[tnt.models.clone_model] `model` needs to be either "
                     "a `tf.keras.Model`, or a `tnt.Model`")
  return tnt.Model(keras_model)
def test_optimizer_with_name(self, optimizer_name, optimizer_type):
  # Compiling with a string optimizer identifier must still result in the
  # optimizer being wrapped into a distributed (synchronous) optimizer.
  model = tnt.Model(mnist.lenet5_model_generator(),
                    parallel_strategy=tnt.ParallelStrategy.DATA)
  loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  model.compile(optimizer=optimizer_name,
                loss=loss_fn,
                metrics=['accuracy'])
  wrapped = model.dist_optimizer
  assert isinstance(wrapped,
                    tnt.distributed_optimizers.SynchDistributedOptimizer)
  # The string name must have been resolved to the expected optimizer class.
  assert isinstance(wrapped.underlying_optimizer, optimizer_type)
def model_from_yaml(yaml_string, **kwargs):
  """Build a `tnt.Model` from a YAML model configuration string.

  Args:
    yaml_string: YAML serialization of a Keras model configuration.
    **kwargs: forwarded to `tf.keras.models.model_from_yaml`.

  Returns:
    A `tnt.Model` using the data-parallel strategy.

  Raises:
    RuntimeError: if the YAML string cannot be deserialized into a model.
  """
  logger.debug("Load model from yaml")
  # Keep the `try` body minimal: only the deserialization can legitimately
  # fail with "cannot load model"; errors raised while wrapping into a
  # tnt.Model should propagate unchanged instead of being mislabeled.
  try:
    keras_model = tf.keras.models.model_from_yaml(yaml_string, **kwargs)
  # Narrowed from a bare `except:` and chained so the original cause of the
  # failure is preserved in the traceback.
  except Exception as e:
    raise RuntimeError("[tnt.models.model_from_yaml] Cannot load model") from e
  # FIXME load models with any type of parallelization strategy
  logger.warning("Loading model with the default `data parallel` strategy.")
  return tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
def test_metrics_names_after_fit(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  model.compile(optimizer=tf.keras.optimizers.Adam(),
                loss="sparse_categorical_crossentropy",
                metrics=["sparse_categorical_accuracy"])
  # One tiny batch is enough to materialize the metrics.
  dataset, _, _ = util.load_dataset(mnist.load_mnist_dataset,
                                    train_size=24,
                                    train_batch_size=24)
  model.fit(dataset)
  # After fitting, the loss plus each compiled metric is exposed by name.
  expected_names = ["loss", "sparse_categorical_accuracy"]
  assert model.metrics_names == expected_names
def test_reset_metrics(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  model.compile(optimizer=tf.keras.optimizers.Adam(),
                loss="sparse_categorical_crossentropy",
                metrics=["sparse_categorical_accuracy"])
  dataset, _, _ = util.load_dataset(mnist.load_mnist_dataset,
                                    train_size=60,
                                    train_batch_size=60)
  model.fit(dataset)
  # Training must have populated every metric with a non-zero value ...
  for metric in model.metrics:
    assert float(metric.result()) != 0
  # ... and resetting must zero all of them again.
  model.reset_metrics()
  for metric in model.metrics:
    assert float(metric.result()) == 0
def clone_model(model, **kwargs):
  """Clone a parallel or plain Keras model into a new `tnt.Model`.

  Args:
    model: a `tnt.strategy.parallel_model.ParallelModel` (its wrapped Keras
           model is cloned) or a plain `tf.keras.Model`.
    **kwargs: forwarded to `tf.keras.models.clone_model`.

  Returns:
    A `tnt.Model` wrapping the clone, using the data-parallel strategy.

  Raises:
    ValueError: if `model` is neither a parallel model nor a `tf.keras.Model`.
  """
  if isinstance(model, tnt.strategy.parallel_model.ParallelModel):
    keras_model = tf.keras.models.clone_model(model.model, **kwargs)
    logger.info("clone model from instance of tnt.Model")
  elif isinstance(model, tf.keras.Model):
    keras_model = tf.keras.models.clone_model(model, **kwargs)
    logger.info("clone model from instance of tf.keras.Model")
  else:
    # Fixed: the original passed two separate arguments to ValueError, which
    # rendered the message as a tuple; concatenate into a single string.
    raise ValueError("[tnt.models.clone_model] `model` needs to be either "
                     "a `tf.keras.Model`, or a `tnt.Model`")
  # FIXME load models with any type of parallelization strategy
  logger.warning("Loading model with the default `data parallel` strategy.")
  return tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
def from_config(cls, config, **kwargs):
  """Build a `tnt.Model` from a `keras.Sequential` configuration dict.

  Args:
    config: a model configuration as produced by `Sequential.get_config`.
    **kwargs: forwarded to `tf.keras.Sequential.from_config`.

  Returns:
    A `tnt.Model` using the data-parallel strategy.

  Raises:
    RuntimeError: if `config` is not a valid `keras.Sequential` configuration.
  """
  try:
    keras_model = tf.keras.Sequential.from_config(config, **kwargs)
    logger.info("Loaded model from `keras.Sequential`.")
  # Narrowed from a bare `except:` and chained so the underlying Keras error
  # is preserved in the traceback instead of being discarded.
  except Exception as e:
    raise RuntimeError(
        """[tnt.keras.Sequential.from_config] Cannot load model;
        provided configuration is not a `keras.Sequential` model."""
    ) from e
  # FIXME load models with any type of parallelization strategy
  logger.warning(
      "Loading model with the default `data parallel` strategy.")
  return tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
def test_cifar_alexnet(self, keras_model, optimizer, micro_batch_size,
                       nbatches, ntest_batches):
  # Global batch size scales with the number of ranks.
  batch_size = micro_batch_size * tnt.get_size()
  number_epochs, lr = cifar.get_hyperparams(optimizer)
  train_dataset, test_dataset = util.load_train_test_dataset(
      cifar.load_cifar_dataset,
      train_size=nbatches * batch_size,
      train_batch_size=batch_size,
      test_size=ntest_batches * batch_size,
      test_batch_size=batch_size)
  # SGD additionally takes a momentum term; every other optimizer only
  # needs the learning rate.
  extra_args = {'momentum': 0.9} if optimizer.__name__ == 'SGD' else {}
  keras_optimizer = optimizer(learning_rate=lr, **extra_args)
  model = tnt.Model(keras_model())
  model.compile(keras_optimizer,
                loss=keras.losses.SparseCategoricalCrossentropy(),
                metrics=[keras.metrics.SparseCategoricalAccuracy()])
  model.fit(train_dataset, epochs=number_epochs, verbose=0)
  # results[1] is the accuracy metric; require better-than-chance learning.
  results = model.evaluate(test_dataset)
  util.check_accuracy_greater(results[1], 0.5)
def __init__(self, flags_obj):
  """Init function of TransformerMain.

  Args:
    flags_obj: Object containing parsed flag values, i.e., FLAGS.
  """
  self.flags_obj = flags_obj
  # Start from the defaults of the chosen parameter set, then override the
  # entries for which an explicit flag value was provided.
  self.params = tnt_misc.get_model_params(flags_obj.param_set)
  self.params["train_epochs"] = flags_obj.train_epochs
  self.params["epochs_between_evals"] = flags_obj.epochs_between_evals
  self.params["num_sentences"] = flags_obj.num_sentences
  self.params["num_eval_sentences"] = flags_obj.num_eval_sentences
  # `or` falls back to the parameter-set default when the flag is unset/0.
  self.params["batch_size"] = flags_obj.batch_size or self.params["default_batch_size"]
  self.params["data_dir"] = flags_obj.data_dir
  self.params["vocab_size"] = flags_obj.vocab_size or self.params["vocab_size"]
  self.params["max_length"] = flags_obj.max_length
  self.params["decode_batch_size"] = flags_obj.decode_batch_size
  self.params["decode_max_length"] = flags_obj.decode_max_length
  # Default to letting tf.data pick the parallelism level.
  self.params["max_io_parallelism"] = (
      flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE)
  self.params["use_synthetic_data"] = flags_obj.use_synthetic_data
  self.params["dtype"] = tf.float32
  # Transformer model used both as Tarantella model (in training) and as a serial
  # model for inference
  internal_model = transformer.Transformer(self.params, name="transformer_v2")
  # The train model includes an additional logits layer and a customized loss
  self.train_model = create_model(internal_model, self.params, is_train = True)
  # Enable distributed training
  self.train_model = tnt.Model(self.train_model)
  # The inference model is wrapped as a different Keras model that does not use labels
  self.predict_model = create_model(internal_model, self.params, is_train = False)
def test_clone_tnt_model(self, keras_model, parallel_strategy):
  # Cloning a tnt.Model must reproduce its configuration exactly.
  original = tnt.Model(keras_model, parallel_strategy)
  clone = tnt.models.clone_model(original)
  util.check_model_configuration_identical(original, clone)
def test_layers(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  # LeNet-5 here has 8 layers total; layer 0 is the input, so the first
  # convolution sits at index 1.
  assert len(model.layers) == 8
  assert model.layers[1].name == 'conv1'
def model_from_yaml(yaml_string, **kwargs):
  """Deserialize a YAML model configuration and wrap it as a `tnt.Model`."""
  return tnt.Model(tf.keras.models.model_from_yaml(yaml_string, **kwargs))
def model_from_json(json_string, **kwargs):
  """Deserialize a JSON model configuration and wrap it as a `tnt.Model`."""
  return tnt.Model(tf.keras.models.model_from_json(json_string, **kwargs))
def load_model(filepath, **kwargs):
  """Load a saved Keras model from `filepath` and wrap it as a `tnt.Model`."""
  loaded = tf.keras.models.load_model(filepath, **kwargs)
  # FIXME: compile tnt.Model before returning
  return tnt.Model(loaded)
def main(_): flags_obj = flags.FLAGS # get rank and comm_size rank = tnt.get_rank() comm_size = tnt.get_size() # compute micro batch if the dataset is not automatically distributed by Tarantella if not flags_obj.auto_distributed: batch_size = flags_obj.batch_size // comm_size else: batch_size = flags_obj.batch_size # Load and preprocess datasets (train_dataset, validation_dataset, _) = dataset_utils.get_tnt_cifar10_dataset(45000, 5000, 10000, batch_size) # Create model and wrap it into a Tarantella model model = resnet_model.resnet32(num_classes=10) model = tnt.Model(model) optimizer = get_optimizer(flags_obj.batch_size) model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=(['sparse_categorical_accuracy'])) model.summary() callbacks = [] if flags_obj.enable_tensorboard: callbacks.append( tf.keras.callbacks.TensorBoard(log_dir=flags_obj.model_dir, profile_batch=2)) if flags_obj.profile_runtime: callbacks.append( RuntimeProfiler(batch_size=batch_size, logging_freq=flags_obj.logging_freq, print_freq=flags_obj.print_freq)) if flags_obj.enable_checkpoint_and_export: if flags_obj.model_dir is not None: ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}') callbacks.append( tf.keras.callbacks.ModelCheckpoint(ckpt_full_path, save_weights_only=True)) logging.info("Start training") kwargs = { 'tnt_distribute_dataset': flags_obj.auto_distributed, 'tnt_distribute_validation_dataset': flags_obj.auto_distributed } history = model.fit(train_dataset, epochs=flags_obj.train_epochs, callbacks=callbacks, validation_data=validation_dataset, validation_freq=flags_obj.epochs_between_evals, verbose=flags_obj.verbose, **kwargs) logging.info("Train history: {}".format(history.history)) kwargs = {'tnt_distribute_dataset': flags_obj.auto_distributed} eval_output = model.evaluate(validation_dataset, verbose=flags_obj.verbose, **kwargs)
def test_non_trainable_weights(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  # Every weight in this model is trainable.
  assert model.non_trainable_weights == []
def generate_tnt_model_runner(model):
  """Wrap `model` into a `tnt.Model` and return a `TrainingRunner` for it."""
  wrapped = tnt.Model(model)
  return TrainingRunner(wrapped)
def test_stateful(self):
  # The wrapped LeNet-5 model carries no stateful layers.
  model = tnt.Model(mnist.lenet5_model_generator())
  assert model.stateful == False
def test_weights(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  # 2 conv + 2 dense layers, each contributing a kernel and a bias -> 8.
  assert len(model.weights) == 8
def test_state_updates(self):
  # No layers with internal state updates, so the list is empty.
  model = tnt.Model(mnist.lenet5_model_generator())
  assert model.state_updates == []
def test_losses(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  # Initially no auxiliary losses are attached ...
  assert model.losses == []
  # ... but registering one on the output tensor makes it visible.
  model.add_loss(tf.abs(model.output))
  assert len(model.losses) == 1
def test_output(self):
  model = tnt.Model(mnist.lenet5_model_generator())
  output_shape = model.output.shape
  # Batch dimension is left undefined; 10 output classes (MNIST digits).
  assert output_shape[0] == None
  assert output_shape[1] == 10
1, ), name='input') x = keras.layers.Conv2D(20, 5, padding="same", activation='relu')(inputs) x = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) x = keras.layers.Conv2D(50, 5, padding="same", activation='relu')(x) x = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) x = keras.layers.Flatten()(x) x = keras.layers.Dense(500, activation='relu')(x) outputs = keras.layers.Dense(10, activation='softmax')(x) return keras.Model(inputs=inputs, outputs=outputs) args = parse_args() # Create Tarantella model model = tnt.Model(lenet5_model_generator()) # Compile Tarantella model (as with Keras) model.compile(optimizer=keras.optimizers.SGD(learning_rate=args.learning_rate), loss=keras.losses.SparseCategoricalCrossentropy(), metrics=[keras.metrics.SparseCategoricalAccuracy()]) # Load MNIST dataset (as with Keras) shuffle_seed = 42 (x_train, y_train), (x_val, y_val), (x_test, y_test) = \ mnist_as_np_arrays(args.train_size, args.val_size, args.test_size) train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = train_dataset.shuffle(len(x_train), shuffle_seed) train_dataset = train_dataset.batch(args.batch_size) train_dataset = train_dataset.prefetch(
def get_tnt_model_compiled(model, parallel_strategy, optimizer):
  """Wrap `model` into a `tnt.Model` and compile it with the shared params."""
  wrapped = tnt.Model(model, parallel_strategy)
  wrapped.compile(optimizer=optimizer, **get_compile_params())
  return wrapped
def test_run_eagerly(self):
  # Eager execution of train/eval steps is off by default.
  model = tnt.Model(mnist.lenet5_model_generator())
  assert model.run_eagerly == False
def test_metrics_names(self):
  # No metric names are exposed before the model is compiled and trained.
  model = tnt.Model(mnist.lenet5_model_generator())
  assert model.metrics_names == []