Example 1
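All of the snippets below construct `PerReplica` values by hand and wrap sections in NVTX ranges. They assume roughly the imports sketched here (module paths may differ across TensorFlow releases; `PerReplica` is a private TF API, and `hugectr_tf_ops_v2` / `sok_unit_test_lib` come from the HugeCTR TensorFlow plugin and SparseOperationKit builds, which are not shown):

import numpy as np
import tensorflow as tf
# private API, not covered by the public TF compatibility guarantee
from tensorflow.python.distribute.values import PerReplica
# NVTX range ops from the nvtx-plugins package
import nvtx.plugins.tf as nvtx_tf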
    def total_train_step(row_offsets, value_tensors, nnz_array, labels):
        with tf.GradientTape() as tape:
            # do embedding fprop
            embedding_results = sparse_model(row_offsets, value_tensors,
                                             nnz_array)

            # convert to PerReplica
            dense_inputs = tf.split(embedding_results,
                                    num_or_size_splits=gpu_count)
            dense_inputs = PerReplica(dense_inputs)
            labels = tf.expand_dims(labels, axis=1)
            labels = tf.split(labels, num_or_size_splits=gpu_count)
            labels = PerReplica(labels)

            replica_loss, input_grads = strategy.run(dense_train_step,
                                                     args=(dense_inputs,
                                                           labels))

            # gather all grads from dense replicas
            all_grads = tf.concat(input_grads.values, axis=0)

            # do embedding backward
            embedding_grads = tape.gradient(embedding_results,
                                            sparse_model.trainable_weights,
                                            output_gradients=all_grads)
            sparse_opt.apply_gradients(
                zip(embedding_grads, sparse_model.trainable_weights))
            return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                   replica_loss,
                                   axis=None)
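The `total_train_step` above (and the NVTX-instrumented variant that follows) expects a `dense_train_step` that runs on every replica and returns the per-replica loss together with the gradient of that loss with respect to the replica's embedding input, so that gradient can be fed back into the sparse embedding via `output_gradients`. A minimal sketch, assuming a `dense_model`, a `_replica_loss` helper, and a `dense_opt` optimizer (none of which are defined in this snippet):

@tf.function
def dense_train_step(dense_input, label):
    with tf.GradientTape() as tape:
        # dense_input is a plain tensor, so it must be watched explicitly
        tape.watch(dense_input)
        logit = dense_model(dense_input)
        loss = _replica_loss(label, logit)

    # gradients w.r.t. the dense weights and w.r.t. the embedding output
    weight_grads, input_grad = tape.gradient(
        loss, [dense_model.trainable_weights, dense_input])
    dense_opt.apply_gradients(zip(weight_grads, dense_model.trainable_weights))
    return loss, input_grad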
        def total_train_step(indices, values, dense_shape, labels):
            with tf.GradientTape() as tape:
                indices, emb_ctx = nvtx_tf.ops.start(indices,
                                                     message='emb_fprop',
                                                     domain_name='forward')

                embedding_result = sparse_model(
                    indices, values, dense_shape,
                    # output_shape = [batch_size, slot_num, embedding_vec_size]
                    output_shape=[-1, 26, 32])

                embedding_result = nvtx_tf.ops.end(embedding_result, emb_ctx)

                embedding_result, dense_ctx = nvtx_tf.ops.start(embedding_result, 
                                                     message='dense_fprop',
                                                     domain_name='forward')

                labels = tf.expand_dims(labels, axis=1)
                dense_inputs = tf.split(embedding_result, num_or_size_splits=len(gpus))
                dense_labels = tf.split(labels, num_or_size_splits=len(gpus))
                dense_inputs_replicas = PerReplica(dense_inputs)
                dense_labels_replicas = PerReplica(dense_labels)

                dense_losses, input_grads = distribute_strategy.run(dense_train_step,
                                                                    args=(dense_inputs_replicas, dense_labels_replicas))

                all_grads = tf.concat(input_grads.values, axis=0)

                all_grads = nvtx_tf.ops.end(all_grads, dense_ctx)

            embedding_grads = tape.gradient(embedding_result, sparse_model.trainable_weights, output_gradients=all_grads)
            sparse_opt.apply_gradients(zip(embedding_grads, sparse_model.trainable_weights))
            return distribute_strategy.reduce(tf.distribute.ReduceOp.SUM, dense_losses, axis=None) 
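As a usage sketch, the first variant would be called directly from the host training loop, since it invokes `strategy.run` internally; the `dataset` name and the CSR-style batch layout are assumptions here:

for step, (row_offsets, value_tensors, nnz_array, labels) in enumerate(dataset):
    total_loss = total_train_step(row_offsets, value_tensors, nnz_array, labels)
    if step % 100 == 0:
        tf.print("step:", step, "loss:", total_loss)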
Example 3
def main():
    # create MirroredStrategy with specified GPUs.
    strategy = tf.distribute.MirroredStrategy(
        devices=["/GPU:" + str(i) for i in range(gpu_count)])

    # create the model instance inside the scope of MirroredStrategy
    with strategy.scope():
        model = PluginSparseModel(...)

        # define an optimizer for the DNN-model variables, excluding the embedding layer
        opt = tf.keras.optimizers.SGD()

    # define loss function for each replica
    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=False, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss_value = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(loss_value,
                                          global_batch_size=batch_size)

    # define train step for one iteration in a replica
    @tf.function
    def _train_step(each_replica, label):
        with tf.GradientTape() as tape:
            label = tf.expand_dims(label, axis=1)
            logit = model(each_replica)
            replica_loss = _replica_loss(label, logit)

        replica_grads = tape.gradient(replica_loss, model.trainable_weights)
        opt.apply_gradients(zip(replica_grads, model.trainable_weights))
        return replica_loss

    # create a tf.data.Dataset to read data
    dataset = ...

    # training loop
    for step, (row_indices, values, labels) in enumerate(dataset):
        # use this API to broadcast input data to each GPU
        to_each_replicas = hugectr_tf_ops_v2.broadcast_then_convert_to_csr(
            model.get_embedding_name,
            row_indices,
            values,
            T=[tf.int32] * gpu_count)
        to_each_replicas = PerReplica(to_each_replicas)
        labels = tf.split(labels, num_or_size_splits=gpu_count)
        labels = PerReplica(labels)

        # run one training step on each replica
        replica_loss = strategy.run(_train_step,
                                    args=(to_each_replicas, labels))

        # reduce the loss across all replicas
        total_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                     replica_loss,
                                     axis=None)
    def _get_step_inputs(batch, nb_instances):

        features, labels = batch

        if isinstance(labels, PerReplica):
            # need to make a `PerReplica` object for ``nb_instances``
            nb_instances = PerReplica([nb_instances] * len(labels.values))

        step_inputs = (features, labels, nb_instances)

        return step_inputs
    def _get_step_inputs(batch):
        features, labels = batch
        if isinstance(labels, PerReplica):
            all_labels = tf.concat(labels.values, axis=0)
        else:
            all_labels = labels
        # count the label positions that are not the ignore index (-100)
        nb_instances = tf.reduce_sum(tf.cast(all_labels != -100, dtype=tf.int32))

        if isinstance(labels, PerReplica):
            # need to make a `PerReplica` object for ``nb_instances`` as well
            nb_instances = PerReplica([nb_instances] * len(labels.values))
        step_inputs = (features, labels, nb_instances)
        return step_inputs
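A sketch of how the second variant might be consumed; `distributed_dataset`, `strategy`, and `training_step` are placeholders. The point is that once `labels` arrives as a `PerReplica`, `nb_instances` must be wrapped into a `PerReplica` as well before the tuple is handed to `strategy.run`:

for batch in distributed_dataset:
    features, labels, nb_instances = _get_step_inputs(batch)
    strategy.run(training_step, args=(features, labels, nb_instances))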
    def call(self):
        slot_num = 26
        max_nnz = 3
        embedding_vec_size = 16

        input_in_each_gpu = [
            np.random.normal(size=(self.global_batch_size * slot_num,
                                   embedding_vec_size)).astype(np.float32)
            for _ in range(8)
        ]
        input_in_each_gpu_ = PerReplica(input_in_each_gpu)
        print("[INFO] original inputs: \n", input_in_each_gpu_)

        target = tf.split(tf.reshape(
            tf.math.reduce_sum(input_in_each_gpu, axis=0),
            shape=[self.global_batch_size, slot_num, embedding_vec_size]),
                          num_or_size_splits=8)
        print("[INFO] target output: \n", target)

        @tf.function
        def _step(replica_input):
            replica_ctx = tf.distribute.get_replica_context()

            replica_output = sok_unit_test_lib.reduce_scatter_dispatcher(
                replica_ctx.replica_id_in_sync_group,
                replica_input,
                global_batch_size=self.global_batch_size,
                slot_num=slot_num,
                max_nnz=max_nnz)
            return replica_output

        outputs = self.strategy.run(_step, args=(input_in_each_gpu_, ))
        print("[INFO] replica output:\n", outputs)

        for i in range(len(input_in_each_gpu)):
            tf.debugging.assert_near(target[i],
                                     outputs.values[i],
                                     message="output %d does not match the target." % i,
                                     atol=1e-5,
                                     rtol=1e-5)
Example 7
    def _v2_fprop_v1_test():
        print("[INFO]: Testing plugin_v2 fprop_experimental vs tf..")
        if vocabulary_size < slot_num:
            raise ValueError("vocabulary_size must be >= slot_num.")

        # generate initial values
        init_value, input_keys = generate_embedding_init_value_and_inputs()

        # -------------------------------- hugectr ops ------------------------------------ #
        class TestModel(tf.keras.models.Model):
            def __init__(self, init_value, name_, embedding_type,
                         optimizer_type, max_vocabulary_size_per_gpu,
                         opt_hparams, update_type, atomic_update, scaler,
                         slot_num, max_nnz, max_feature_num,
                         embedding_vec_size, combiner):
                super(TestModel, self).__init__()

                self.input_buffer_reset = (embedding_type == "distributed")

                self.embedding_name = hugectr_tf_ops_v2.create_embedding(
                    init_value=init_value,
                    name_=name_,
                    embedding_type=embedding_type,
                    optimizer_type=optimizer_type,
                    max_vocabulary_size_per_gpu=max_vocabulary_size_per_gpu,
                    opt_hparams=opt_hparams,
                    update_type=update_type,
                    atomic_update=atomic_update,
                    scaler=scaler,
                    slot_num=slot_num,
                    max_nnz=max_nnz,
                    max_feature_num=max_feature_num,
                    embedding_vec_size=embedding_vec_size,
                    combiner=combiner)

            def build(self, _):
                self.bp_trigger = self.add_weight(name="bp_trigger",
                                                  shape=(1, ),
                                                  dtype=tf.float32,
                                                  trainable=True)

            @tf.function
            def call(self, row_offset, values, nnz, training=True):
                replica_ctx = tf.distribute.get_replica_context()
                result = hugectr_tf_ops_v2.fprop_experimental(
                    self.embedding_name,
                    replica_ctx.replica_id_in_sync_group,
                    row_offset,
                    values,
                    nnz,
                    self.bp_trigger,
                    input_buffer_reset=self.input_buffer_reset)
                return result

        hugectr_tf_ops_v2.init(visible_gpus=gpus,
                               seed=0,
                               key_type='int64',
                               value_type='float',
                               batch_size=batch_size,
                               batch_size_eval=len(gpus))

        strategy = tf.distribute.MirroredStrategy(
            devices=['/GPU:' + str(i) for i in gpus])
        with strategy.scope():
            hugectr_model = TestModel(
                init_value=init_value,
                name_='test_embedding',
                embedding_type=embedding_type,
                optimizer_type='Adam',
                max_vocabulary_size_per_gpu=(vocabulary_size // len(gpus)) * 2
                + 1,
                opt_hparams=[0.1, 0.9, 0.99, 1e-5],
                update_type='Global',
                atomic_update=True,
                scaler=1.0,
                slot_num=slot_num,
                max_nnz=max_nnz,
                max_feature_num=slot_num * max_nnz,
                embedding_vec_size=embedding_vec_size,
                combiner='sum')
            opt = tf.keras.optimizers.Adam(learning_rate=0.1,
                                           beta_1=0.9,
                                           beta_2=0.99,
                                           epsilon=1e-5)

        # preprocess inputs
        dataset_utils = CreateDataset(dataset_names=None,
                                      feature_desc=None,
                                      batch_size=batch_size,
                                      n_epochs=None,
                                      slot_num=slot_num,
                                      max_nnz=max_nnz,
                                      convert_to_csr=None,
                                      gpu_count=len(gpus),
                                      embedding_type=embedding_type,
                                      get_row_indices=None)
        if "distributed" == embedding_type:
            row_offsets, value_tensors, nnz_array = dataset_utils._distribute_keys_for_distributed(
                input_keys)
        elif "localized" == embedding_type:
            row_offsets, value_tensors, nnz_array = dataset_utils._distribute_keys_for_localized(
                input_keys)
        else:
            raise ValueError("Not supported embedding_type %s." %
                             embedding_type)

        # forward function
        @tf.function
        def hugectr_train_step(row_offset, values, nnz):
            with tf.GradientTape() as tape:
                forward_result = hugectr_model(row_offset, values, nnz)

            grads = tape.gradient(forward_result,
                                  hugectr_model.trainable_weights)
            opt.apply_gradients(zip(grads, hugectr_model.trainable_weights))
            return forward_result

        # -------------------------------- tf ops ------------------------------------------- #
        reshape_input_keys = np.reshape(input_keys, [-1, max_nnz])
        tf_indices = tf.where(reshape_input_keys != -1)
        tf_values = tf.gather_nd(reshape_input_keys, tf_indices)
        sparse_tensor = tf.sparse.SparseTensor(tf_indices, tf_values,
                                               reshape_input_keys.shape)

        tf_embedding_layer = OriginalEmbedding(
            vocabulary_size=vocabulary_size,
            embedding_vec_size=embedding_vec_size,
            initializer=init_value,
            combiner='sum',
            gpus=gpus)

        tf_opt = tf.keras.optimizers.Adam(learning_rate=0.1,
                                          beta_1=0.9,
                                          beta_2=0.99,
                                          epsilon=1e-5)

        @tf.function
        def tf_train_step(sparse_tensor):
            with tf.GradientTape() as tape:
                tf_forward = tf_embedding_layer(
                    sparse_tensor,
                    output_shape=[batch_size, slot_num, embedding_vec_size])

            grads = tape.gradient(tf_forward,
                                  tf_embedding_layer.trainable_weights)
            tf_opt.apply_gradients(
                zip(grads, tf_embedding_layer.trainable_weights))
            return tf_forward

        # ------------------ comparison ---------------------------------------------------- #
        for iteration in range(2):
            replica_row_offsets = PerReplica(row_offsets)
            replica_values = PerReplica(value_tensors)
            replica_nnz = PerReplica(nnz_array)

            hugectr_forward = strategy.run(hugectr_train_step,
                                           args=(replica_row_offsets,
                                                 replica_values, replica_nnz))
            if len(gpus) > 1:
                hugectr_forward = tf.concat(hugectr_forward.values, axis=0)

            tf_forward = tf_train_step(sparse_tensor)

            try:
                tf.debugging.assert_near(hugectr_forward,
                                         tf_forward,
                                         rtol=1e-4,
                                         atol=1e-5)
            except tf.errors.InvalidArgumentError as error:
                raise error
            else:
                print(
                    "[INFO]: The results from HugeCTR and TF are the same in iteration %d"
                    % (iteration + 1))

        # --------------------- release resources -------------------------------------- #
        hugectr_tf_ops_v2.reset()
    def call(self):
        rows_num_per_sample = 26
        max_nnz = 3

        all_inputs = np.random.randint(
            low=1,
            high=100,
            size=[self.global_batch_size * rows_num_per_sample, max_nnz])

        all_mask = np.random.randint(
            low=0,
            high=2,
            size=[self.global_batch_size * rows_num_per_sample, max_nnz])

        all_inputs *= all_mask
        print("[INFO] original dense all inputs:\n", all_inputs)

        all_valid_indices = tf.where(all_inputs != 0)
        all_valid_values = tf.gather_nd(all_inputs, all_valid_indices)

        all_inputs_sparse_tensor = tf.sparse.SparseTensor(
            values=all_valid_values,
            indices=all_valid_indices,
            dense_shape=all_inputs.shape)
        print("[INFO] original inputs sparse tensor:\n",
              all_inputs_sparse_tensor)

        sparse_tensors = tf.sparse.split(sp_input=all_inputs_sparse_tensor,
                                         num_split=8,
                                         axis=0)
        sparse_tensors = PerReplica(sparse_tensors)
        print("[INFO] to each replica sparse tensors:\n", sparse_tensors)

        target_values = all_inputs_sparse_tensor.values
        # target_indices = tf.concat([tf.transpose(sparse_tensor.indices, perm=[1, 0])[0]
        #                             for sparse_tensor in sparse_tensors.values],
        #                            axis=0)
        target_indices = tf.transpose(all_inputs_sparse_tensor.indices,
                                      perm=[1, 0])[0]
        target_num_elements = tf.concat([
            tf.shape(sparse_tensor.indices, out_type=tf.int64)[0]
            for sparse_tensor in sparse_tensors.values
        ],
                                        axis=0)
        target_total_valid_num = tf.size(target_values, out_type=tf.int64)
        print("[INFO] target_values: \n", target_values)
        print("[INFO] target_indcies: \n", target_indices)
        print("[INFO] target_num_elements: \n", target_num_elements)
        print("[INFO] target_total_valid_num: \n", target_total_valid_num)

        @tf.function
        def _step(sparse_tensor):
            if not isinstance(sparse_tensor, tf.sparse.SparseTensor):
                raise RuntimeError(
                    "sparse_tensor must be a tf.sparse.SparseTensor")

            values = sparse_tensor.values  # [num_of_valids,]
            indices = sparse_tensor.indices
            row_indices = tf.transpose(indices, perm=[1,
                                                      0])[0]  # [num_of_valids]

            replica_ctx = tf.distribute.get_replica_context()

            values_out, indices_out, num_elements, total_valid_num = sok_unit_test_lib.all_gather_dispatcher(
                replica_ctx.replica_id_in_sync_group,
                replica_ctx.num_replicas_in_sync,
                values,
                row_indices,
                global_batch_size=self.global_batch_size,
                rows_num_per_sample=rows_num_per_sample,
                max_nnz=max_nnz)
            return values_out, indices_out, num_elements, total_valid_num

        values_out, indices_out, num_elements, total_valid_num = self.strategy.run(
            _step, args=(sparse_tensors, ))
        print("[INFO]: after all gather dispatcher, values = \n", values_out)
        print("[INFO]: after all gather dispatcher, indices = \n", indices_out)
        print("[INFO]: after all gather dispatcher, num_elements = \n",
              num_elements)
        print("[INFO]: after all gather dispatcher, total_valid_num = \n",
              total_valid_num)

        for i in range(len(values_out.values)):
            tf.debugging.assert_equal(
                target_values,
                values_out.values[i][:target_values.shape[0]],
                message="values %d do not match the target." % i)
            tf.debugging.assert_equal(
                target_indices,
                indices_out.values[i][:target_indices.shape[0]],
                message="indices %d do not match the target." % i)
            tf.debugging.assert_equal(
                target_num_elements,
                num_elements.values[i][:target_num_elements.shape[0]],
                message="num_elements %d do not match the target." % i)
            tf.debugging.assert_equal(
                target_total_valid_num,
                total_valid_num.values[i],
                message="total_valid_num %d does not match the target." % i)
def profile_plugin(embedding_type, gpu_count, vocabulary_size=1737710, 
                    slot_num=26, max_nnz=1, batch_size=65536,
                    fprop_version='v1'):
    file_name = "plugin_v2_" + embedding_type + "_" + str(gpu_count) + "_" + fprop_version
    dataset = plugin_reader(file_name, gpu_count, fprop_version)

    # build model
    strategy = tf.distribute.MirroredStrategy(devices=["/GPU:" + str(i) for i in range(gpu_count)])
    with strategy.scope():
        if fprop_version == 'v1':
            model = PluginSparseModel(batch_size=batch_size, 
                                    gpus=[i for i in range(gpu_count)],
                                    init_value=False, name_='hugectr_embedding', 
                                    embedding_type=embedding_type, optimizer_type='Adam',
                                    max_vocabulary_size_per_gpu=(vocabulary_size // gpu_count) + 1,
                                    opt_hparams=[0.1, 0.9, 0.999, 1e-3],
                                    update_type='Local',
                                    atomic_update=True,
                                    scaler=1.0,
                                    slot_num=slot_num,
                                    max_nnz=max_nnz,
                                    max_feature_num=100,
                                    embedding_vec_size=32,
                                    combiner='sum',
                                    num_dense_layers=7,
                                    input_buffer_reset=False)
        elif fprop_version == 'v2':
            model = PluginSparseModelV2(batch_size=batch_size, 
                                    gpus=[i for i in range(gpu_count)],
                                    init_value=False, name_='hugectr_embedding', 
                                    embedding_type=embedding_type, optimizer_type='Adam',
                                    max_vocabulary_size_per_gpu=(vocabulary_size // gpu_count) + 1,
                                    opt_hparams=[0.1, 0.9, 0.999, 1e-3],
                                    update_type='Local',
                                    atomic_update=True,
                                    scaler=1.0,
                                    slot_num=slot_num,
                                    max_nnz=max_nnz,
                                    max_feature_num=100,
                                    embedding_vec_size=32,
                                    combiner='sum',
                                    num_dense_layers=7,
                                    input_buffer_reset=False)
        dense_opt = tf.keras.optimizers.SGD()

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss_v = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(loss_v, global_batch_size=batch_size)

    if fprop_version == 'v1':
        @tf.function
        def _train_step(row_offset, values, nnz, label):
            with tf.GradientTape() as tape:
                label = tf.expand_dims(label, axis=1)
                logit = model(row_offset, values, nnz)
                replica_loss = _replica_loss(label, logit)

            replica_grads = tape.gradient(replica_loss, model.trainable_weights)
            dense_opt.apply_gradients(zip(replica_grads, model.trainable_weights))
            return replica_loss
    elif fprop_version == 'v2':
        @tf.function
        def _train_step(each_replica, label):
            with tf.GradientTape() as tape:
                label = tf.expand_dims(label, axis=1)
                logit = model(each_replica)
                replica_loss = _replica_loss(label, logit)

            replica_grads = tape.gradient(replica_loss, model.trainable_weights)
            dense_opt.apply_gradients(zip(replica_grads, model.trainable_weights))
            return replica_loss

    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    dataset = dataset.with_options(options)

    input_options = tf.distribute.InputOptions(
        experimental_prefetch_to_device=True, # TODO: not working..
        experimental_replication_mode=tf.distribute.InputReplicationMode.PER_WORKER,
        experimental_place_dataset_on_device=False
    )

    if fprop_version == "v1":
        dataset = strategy.experimental_distribute_dataset(dataset, input_options)

        for step, (row_offsets, value_tensors, nnz_array, labels) in enumerate(dataset):
            step, step_ctx = nvtx_tf.ops.start(tf.convert_to_tensor(step, dtype=tf.int32),
                                                message='Iteration_' + str(step))

            replica_loss = strategy.run(_train_step, args=(row_offsets, value_tensors, nnz_array, labels))
            total_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, replica_loss, axis=None)

            tf.print("step:%d, loss:%.5f" %(step, total_loss))

            step = nvtx_tf.ops.end(step, step_ctx)

    elif fprop_version == 'v2':
        for step, (row_indices, values, labels) in enumerate(dataset):
            step, step_ctx = nvtx_tf.ops.start(tf.convert_to_tensor(step, dtype=tf.int32),
                                                message='Iteration_' + str(step))

            to_each_replicas = hugectr_tf_ops_v2.broadcast_then_convert_to_csr(
                        model.get_embedding_name, row_indices, values, T = [tf.int32] * gpu_count)
            to_each_replicas = PerReplica(to_each_replicas)
            labels = tf.split(labels, num_or_size_splits=gpu_count)
            labels = PerReplica(labels)

            replica_loss = strategy.run(_train_step, args=(to_each_replicas, labels))
            total_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, replica_loss, axis=None)

            tf.print("step:%d, loss:%.5f" %(step, total_loss))
            
            step = nvtx_tf.ops.end(step, step_ctx)
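A possible entry point for the profiler above; the argument values are only illustrative, and the valid `embedding_type` strings ("distributed" / "localized") follow the checks used earlier in these examples:

if __name__ == "__main__":
    profile_plugin(embedding_type="distributed",
                   gpu_count=4,
                   fprop_version="v1")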