Example #1
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.strategy = tf.distribute.experimental.ParameterServerStrategy(
            multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
            variable_partitioner=tf.distribute.experimental.partitioners.
            FixedShardsPartitioner(2),
        )
Example #2
    def _model_compile(self,
                       strategy,
                       steps_per_execution=1,
                       run_eagerly=False,
                       with_normalization_layer=False,
                       use_lookup_layer=False):
        class ResultAssertingCallback(callbacks_lib.Callback):
            """A callback that asserts the result of the tests."""
            def __init__(self):
                self._prev_epoch = -1

            def on_epoch_end(self, epoch, logs=None):
                logging.info("testModelFit: epoch=%r, logs=%r", epoch, logs)
                if epoch <= self._prev_epoch:
                    raise RuntimeError(
                        "Epoch is supposed to be larger than previous.")
                self._prev_epoch = epoch
                is_loss_float = (logs.get("loss", None) is not None
                                 and isinstance(logs["loss"],
                                                (float, np.floating)))
                if not is_loss_float:
                    raise RuntimeError(
                        "loss is supposed to be in the logs and float.")

            def on_train_end(self, logs=None):
                if self._prev_epoch != 9:
                    raise RuntimeError("Unexpected last epoch: {}".format(
                        self._prev_epoch))

        # TODO(b/182193218): Use ParameterServerStrategy as a proper strategy
        # combination.
        if strategy == "ParameterServerStrategy":
            gpu_devices = tf.config.list_physical_devices("GPU")
            if len(gpu_devices) > 1:
                self.skipTest("b/178452835: Multi-GPUs not supported in "
                              "ParameterServerStrategy.")
            strategy = tf.distribute.experimental.ParameterServerStrategy(
                multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
                variable_partitioner=tf.distribute.experimental.partitioners.
                FixedShardsPartitioner(2))

        with strategy.scope():
            model = sequential.Sequential([core_layers.Dense(10)])
            if with_normalization_layer:
                norm = keras.layers.BatchNormalization(axis=-1,
                                                       input_shape=(4, 4, 3),
                                                       momentum=0.8)
                model.add(norm)
            model.add(core_layers.Dense(1, activation="sigmoid"))
            self._metric = keras.metrics.Accuracy()

        model.compile(gradient_descent.SGD(),
                      loss="binary_crossentropy",
                      metrics=[self._metric],
                      steps_per_execution=steps_per_execution,
                      run_eagerly=run_eagerly)
        return model, [ResultAssertingCallback()]
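
The ResultAssertingCallback above assumes a 10-epoch run (it checks that the last epoch index is 9) and a scalar float loss. Below is a minimal sketch, not taken from the test file, of how a model compiled like this is typically driven; it uses the default strategy and a made-up in-memory dataset in place of the parameter-server cluster the real tests set up.

import numpy as np
import tensorflow as tf
from tensorflow import keras

# Stand-in for the model built by _model_compile (Dense(10) -> Dense(1, sigmoid)).
model = keras.Sequential([
    keras.layers.Dense(10),
    keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(keras.optimizers.SGD(),
              loss="binary_crossentropy",
              metrics=[keras.metrics.Accuracy()],
              steps_per_execution=10)

# Toy binary-classification data; the real tests feed a distributed dataset.
x = np.random.uniform(size=(64, 10)).astype("float32")
y = np.random.randint(0, 2, size=(64, 1)).astype("float32")
dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(8)

# ResultAssertingCallback expects exactly 10 epochs (last epoch index == 9);
# it would be passed here via the `callbacks=` argument.
model.fit(dataset, epochs=10, steps_per_epoch=10)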
Example #3
    def test_slot_variable_checkpoint_load_with_diff_shards(self):

        with self.strategy.scope():
            # Set a name so the ShardedVariable is well-named for slot var
            # keying
            var = tf.Variable([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="test")

        opt = keras.optimizers.optimizer_v2.adam.Adam()

        # Run once to trigger apply_gradients to populate optimizer slot
        # variables.
        def train_step():
            with tf.GradientTape() as tape:
                loss = sum(var)
            opt.minimize(loss, var.variables, tape=tape)

        self.strategy.run(train_step)

        # Check that get_slot returns the same result for each slot name
        # before and after checkpointing.
        pre_ckpt_slots = []
        for slot in opt.get_slot_names():
            pre_ckpt_slots.extend(
                tf.concat(list(opt.get_slot(var, slot)), axis=0).numpy())

        ckpt = tf.train.Checkpoint(var=var, opt=opt)
        saved_dir = self.get_temp_dir()
        ckpt_prefix = f"{saved_dir}/ckpt"
        ckpt.save(ckpt_prefix)

        # Create a new strategy with a different number of shards.
        strategy2 = tf.distribute.experimental.ParameterServerStrategy(
            multi_worker_testing_utils.make_parameter_server_cluster(3, 2),
            variable_partitioner=tf.distribute.experimental.partitioners.
            FixedShardsPartitioner(3),
        )

        # Create new variable with different values, to be overwritten by ckpt.
        with strategy2.scope():
            var = tf.Variable([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], name="test")

        opt = keras.optimizers.optimizer_v2.adam.Adam()
        # Run once to trigger apply_gradients to populate optimizer slot
        # variables.
        strategy2.run(train_step)

        new_ckpt = tf.train.Checkpoint(var=var, opt=opt)
        new_ckpt.restore(tf.train.latest_checkpoint(saved_dir))
        post_ckpt_slots = []
        for slot in new_ckpt.opt.get_slot_names():
            post_ckpt_slots.extend(
                tf.concat(list(new_ckpt.opt.get_slot(var, slot)),
                          axis=0).numpy())
        self.assertAllClose(pre_ckpt_slots, post_ckpt_slots)
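
The two strategies in this test differ only in their partitioner: self.strategy is built with FixedShardsPartitioner(2) in the class setup shown earlier, while strategy2 uses FixedShardsPartitioner(3), so the same six-element variable is stored as two shards before saving and three after restoring. A minimal sketch, assuming TF 2.x's partitioner API, of how the shard count is decided (partitioners are callable and return the number of partitions per axis):

import tensorflow as tf

fixed2 = tf.distribute.experimental.partitioners.FixedShardsPartitioner(num_shards=2)
fixed3 = tf.distribute.experimental.partitioners.FixedShardsPartitioner(num_shards=3)

shape = tf.TensorShape([6])  # same shape as the `test` variable above
print(fixed2(shape, tf.float32))  # expect [2]: two shards of 3 elements
print(fixed3(shape, tf.float32))  # expect [3]: three shards of 2 elements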
Example #4
    def test_saved_model_min_size_partitioner(self):

        # Set min_shard_bytes so that the Dense kernel (6 * 6 float32 values,
        # i.e. 144 bytes) is split into 2 shards and the bias into 1.
        partitioner = (
            tf.distribute.experimental.partitioners.MinSizePartitioner(
                min_shard_bytes=(6 * 6 * 4) // 2, max_shards=2))

        cluster_resolver = (
            multi_worker_testing_utils.make_parameter_server_cluster(3, 2))
        strategy = tf.distribute.experimental.ParameterServerStrategy(
            cluster_resolver, variable_partitioner=partitioner)

        def create_dense_model():
            inputs = keras.layers.Input(shape=(6, ))
            outputs = keras.layers.Dense(6)(inputs)
            model = keras.Model(inputs, outputs)
            model.compile(optimizer="adam", loss="mean_squared_error")
            return model

        x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32)
        with strategy.scope():
            model = create_dense_model()
            expect = model(x)

        # 2 kernel variables, 1 bias
        self.assertLen(model.variables, 3)

        saved_dir = self.get_temp_dir()
        model.save(saved_dir)

        # Set min_shard_bytes so that the Dense kernel is split into 3 shards
        # and the bias into 1.
        partitioner2 = (
            tf.distribute.experimental.partitioners.MinSizePartitioner(
                min_shard_bytes=(6 * 6 * 4) // 3, max_shards=3))
        strategy2 = tf.distribute.experimental.ParameterServerStrategy(
            cluster_resolver, variable_partitioner=partitioner2)

        with strategy2.scope():
            loaded_model = keras.models.load_model(saved_dir)
            got = loaded_model(x)

            self.assertAllClose(got, expect)
            # 3 kernel variables, 1 bias
            self.assertLen(loaded_model.variables, 4)
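
The min_shard_bytes values above come from the kernel's size in bytes. A small sketch, assuming TF 2.x, of that arithmetic; the printed shard counts are the expected values, not output copied from the test:

import tensorflow as tf

kernel_bytes = 6 * 6 * 4  # Dense(6) kernel: 36 float32 values = 144 bytes

# min_shard_bytes = 144 // 2 = 72 allows at most two shards for the kernel.
partitioner = tf.distribute.experimental.partitioners.MinSizePartitioner(
    min_shard_bytes=kernel_bytes // 2, max_shards=2)

print(partitioner(tf.TensorShape([6, 6]), tf.float32))  # expect [2, 1]

# The bias holds only 6 floats (24 bytes), below min_shard_bytes, so one shard.
print(partitioner(tf.TensorShape([6]), tf.float32))  # expect [1]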
Example #5
def make_coordinator(num_workers, num_ps, variable_partitioner=None):
    return tf.distribute.experimental.coordinator.ClusterCoordinator(
        tf.distribute.experimental.ParameterServerStrategy(
            multi_worker_testing_utils.make_parameter_server_cluster(
                num_workers, num_ps),
            variable_partitioner=variable_partitioner))
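
A hedged usage sketch for make_coordinator (not from the test file): it assumes the in-process cluster started by multi_worker_testing_utils is available, and the variable and function names below are illustrative.

import tensorflow as tf

coordinator = make_coordinator(num_workers=3, num_ps=2)  # helper defined above

# Variables must be created under the strategy owned by the coordinator.
with coordinator.strategy.scope():
    counter = tf.Variable(0.0)

@tf.function
def worker_fn():
    counter.assign_add(1.0)
    return counter.read_value()

result = coordinator.schedule(worker_fn)  # returns a RemoteValue immediately
coordinator.join()                        # block until scheduled work finishes
print(result.fetch())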
Example #6
    @classmethod
    def setUpClass(cls):
        super(KPLCreatedInDatasetsFromFunctionTest, cls).setUpClass()
        cls.coordinator = tf.distribute.experimental.coordinator.ClusterCoordinator(
            tf.distribute.experimental.ParameterServerStrategy(
                multi_worker_testing_utils.make_parameter_server_cluster(3, 2)))
Example #7
    def test_saved_model_combined(self, shard_config, model_type):
        """Test saving and loading models with various fixed numbers of shards.

        Args:
          shard_config: The number of shards to use per variable before and
            after loading. For example, [1, 3] means to create and save the
            model with 1 shard (i.e., no variable partitioning), and load it
            into 3 shards per variable.
          model_type: Either 'dense' or 'embedding', indicating which of the
            two simple models to test.
        """
        def create_embedding_model():
            inputs = keras.layers.Input(shape=(6, ))
            embedding = keras.layers.Embedding(output_dim=2, input_dim=6)
            outputs = embedding(inputs)
            model = keras.Model(inputs, outputs)
            model.compile(optimizer="adam", loss="mean_squared_error")
            return model

        def create_dense_model():
            inputs = keras.layers.Input(shape=(6, ))
            outputs = keras.layers.Dense(6)(inputs)
            model = keras.Model(inputs, outputs)
            model.compile(optimizer="adam", loss="mean_squared_error")
            return model

        # Maybe create a new strategy with a different number of shards.
        if shard_config[0] > 2:
            strategy = tf.distribute.experimental.ParameterServerStrategy(
                multi_worker_testing_utils.make_parameter_server_cluster(3, 3),
                variable_partitioner=tf.distribute.experimental.partitioners.
                FixedShardsPartitioner(shard_config[0]),
            )
        elif shard_config[0] == 2:
            strategy = self.strategy
        else:
            # Just one shard, so use default strategy
            strategy = tf.distribute.get_strategy()

        x = tf.cast(tf.expand_dims(tf.range(6), 0), tf.float32)
        with strategy.scope():
            model = (create_dense_model()
                     if model_type == "dense" else create_embedding_model())
            expect = model(x)

        # Dense layers have two variables (kernel and bias); embedding layers
        # have one.
        n_expected_variables = shard_config[0] * (2 if model_type == "dense"
                                                  else 1)
        self.assertLen(model.variables, n_expected_variables)
        model_weights = [v.numpy() for v in model.variables]

        saved_dir = self.get_temp_dir()
        model.save(saved_dir)

        if shard_config[1] > 2:
            strategy2 = tf.distribute.experimental.ParameterServerStrategy(
                multi_worker_testing_utils.make_parameter_server_cluster(3, 3),
                variable_partitioner=tf.distribute.experimental.partitioners.
                FixedShardsPartitioner(shard_config[1]),
            )
        elif shard_config[1] == 2:
            strategy2 = self.strategy
        else:
            # Just one shard, so use default strategy
            strategy2 = tf.distribute.get_strategy()

        with strategy2.scope():
            loaded_model = keras.models.load_model(saved_dir)
            got = loaded_model(x)

            self.assertAllClose(got, expect)
            n_expected_variables = shard_config[1] * (2 if model_type
                                                      == "dense" else 1)
            self.assertLen(loaded_model.variables, n_expected_variables)
            loaded_model_weights = [v.numpy() for v in loaded_model.variables]
            self.assertAllClose(
                np.concatenate([w.flatten() for w in model_weights]),
                np.concatenate([w.flatten() for w in loaded_model_weights]),
            )
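
The final assertion relies on the fact that flattening each shard and concatenating the results reproduces the unpartitioned weights, no matter how many shards the kernel was split into. A standalone NumPy sketch of that property; the shapes mirror the Dense(6) model and the data is made up:

import numpy as np

kernel = np.arange(36, dtype=np.float32).reshape(6, 6)  # Dense(6) kernel
bias = np.zeros(6, dtype=np.float32)

one_shard = [kernel, bias]                         # e.g. saved with shard_config[0] == 1
three_shards = list(np.split(kernel, 3)) + [bias]  # loaded with shard_config[1] == 3

def flat(weights):
    return np.concatenate([w.flatten() for w in weights])

np.testing.assert_allclose(flat(one_shard), flat(three_shards))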