Example #1
def call(self, row_offsets, value_tensors, nnz_array, training=True):
    # forward propagation of the embedding layer
    return hugectr_tf_ops.fprop_v3(embedding_name=self.embedding_name,
                                   row_offsets=row_offsets,
                                   value_tensors=value_tensors,
                                   nnz_array=nnz_array,
                                   bp_trigger=self.bp_trigger,
                                   is_training=training,
                                   output_shape=[
                                       self.batch_size, self.slot_num,
                                       self.embedding_vec_size
                                   ])
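The call above is a method of a Keras-style wrapper layer around the HugeCTR embedding plugin. For context, here is a minimal sketch of what the surrounding class might look like; the class name PluginEmbedding, the constructor signature, and the use of tf.keras.layers.Layer are illustrative assumptions, while the attributes referenced by call (embedding_name, bp_trigger, batch_size, slot_num, embedding_vec_size) are taken from the example itself.

import tensorflow as tf
import hugectr_tf_ops  # plugin module used throughout these examples (import path may differ)


class PluginEmbedding(tf.keras.layers.Layer):
    """Hypothetical wrapper layer; only a sketch, not the actual class."""

    def __init__(self, embedding_name, batch_size, slot_num, embedding_vec_size):
        super().__init__()
        # handle returned by hugectr_tf_ops.create_embedding(...)
        self.embedding_name = embedding_name
        self.batch_size = batch_size
        self.slot_num = slot_num
        self.embedding_vec_size = embedding_vec_size
        # dummy trainable variable; differentiating it appears to trigger the
        # plugin's backward pass (see Examples #3 and #4 below)
        self.bp_trigger = tf.Variable(initial_value=1.0,
                                      trainable=True,
                                      dtype=tf.float32)

    def call(self, row_offsets, value_tensors, nnz_array, training=True):
        return hugectr_tf_ops.fprop_v3(embedding_name=self.embedding_name,
                                       row_offsets=row_offsets,
                                       value_tensors=value_tensors,
                                       nnz_array=nnz_array,
                                       bp_trigger=self.bp_trigger,
                                       is_training=training,
                                       output_shape=[
                                           self.batch_size, self.slot_num,
                                           self.embedding_vec_size
                                       ])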
Example #2
def call(self,
         row_offsets,
         value_tensors,
         nnz_array,
         output_shape,
         training=False):
    return hugectr_tf_ops.fprop_v3(embedding_name=self.name_,
                                   row_offsets=row_offsets,
                                   value_tensors=value_tensors,
                                   nnz_array=nnz_array,
                                   bp_trigger=self.bp_trigger,
                                   is_training=training,
                                   output_shape=output_shape)
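Example #2 only differs from #1 in that the output shape is passed per call and training defaults to False. In both variants the embedding table is owned by the plugin rather than being an ordinary tf.Variable: the tests below obtain gradients by differentiating the forward result with respect to bp_trigger, which appears to trigger the plugin's internal backward pass and update (its optimizer hyperparameters are set in create_embedding). A minimal, hedged training-step sketch along those lines, reusing the hypothetical PluginEmbedding from the note after Example #1 and assuming row_offsets, value_tensors, nnz_array already come from the CSR preprocessing step:

import tensorflow as tf

layer = PluginEmbedding(embedding_name, batch_size, slot_num, embedding_vec_size)

with tf.GradientTape() as tape:
    # CSR-style inputs produced by the preprocessing helpers in Examples #3 and #4
    vectors = layer(row_offsets, value_tensors, nnz_array, training=True)
    loss = tf.reduce_sum(vectors)

# requesting the gradient of bp_trigger drives the plugin's backward pass;
# the returned gradient itself is not applied with a TF optimizer
_ = tape.gradient(loss, layer.bp_trigger)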
Example #3
    def _fprop_v3_VS_tf():
        print("[INFO]: Testing fprop_v3 vs tf...")
        if vocabulary_size < slot_num:
            raise RuntimeError("vocabulary_size must > slot.")
        with tf.GradientTape(persistent=True) as tape:
            # initial embedding table
            init_value = np.float32(
                np.random.normal(loc=0,
                                 scale=1,
                                 size=(vocabulary_size, embedding_vec_size)))
            # input keys
            # TODO: Keys in different slots should be unique.
            input_keys = np.ones(shape=(batch_size, slot_num, max_nnz),
                                 dtype=np.int64) * -1
            each_slot = vocabulary_size // slot_num
            # after the first slot draws nnz == 0, later slots draw at least one key
            nnz_0_num = 0
            for batch_id in range(batch_size):
                for slot_id in range(slot_num):
                    nnz = np.random.randint(
                        low=nnz_0_num, high=max_nnz + 1,
                        size=1)[0]  # how many keys in this slot
                    if nnz == 0:
                        nnz_0_num = 1
                    if (embedding_type == 'distributed'):
                        keys = np.random.randint(low=slot_id * each_slot,
                                                 high=(slot_id + 1) *
                                                 each_slot,
                                                 size=nnz)
                    elif (embedding_type == "localized"):
                        keys = []
                        while len(keys) < nnz:
                            key = np.random.randint(low=slot_id * each_slot,
                                                    high=(slot_id + 1) *
                                                    each_slot,
                                                    size=1)[0]  # scalar key
                            if key % slot_num == slot_id:
                                keys.append(key)

                    input_keys[batch_id, slot_id, 0:nnz] = keys

            # hugectr ops
            hugectr_tf_ops.init(visiable_gpus=gpus,
                                key_type='int64',
                                value_type='float',
                                batch_size=batch_size,
                                batch_size_eval=len(gpus))
            embedding_name = hugectr_tf_ops.create_embedding(
                init_value=init_value,
                opt_hparams=[0.1, 0.9, 0.99, 1e-5],
                name_='hugectr_embedding',
                max_vocabulary_size_per_gpu=(vocabulary_size // len(gpus)) * 2
                + 1,
                slot_num=slot_num,
                embedding_vec_size=embedding_vec_size,
                max_feature_num=slot_num * max_nnz,
                embedding_type=embedding_type,
                max_nnz=max_nnz,
                update_type='Global')

            # use CreateDataset to do preprocessing
            dataset_utils = CreateDataset(dataset_names=None,
                                          feature_desc=None,
                                          batch_size=batch_size,
                                          n_epochs=1,
                                          slot_num=slot_num,
                                          max_nnz=max_nnz,
                                          convert_to_csr=None,
                                          gpu_count=len(gpus),
                                          embedding_type=embedding_type,
                                          get_row_indices=None)

            if ("distributed" == embedding_type):
                row_offsets, value_tensor, nnz_array = dataset_utils._distribute_keys_for_distributed(
                    input_keys)
            elif ("localized" == embedding_type):
                row_offsets, value_tensor, nnz_array = dataset_utils._distribute_keys_for_localized(
                    input_keys)
            else:
                raise RuntimeError("Unsupported embedding_type: %s" %
                                   embedding_type)

            bp_trigger = tf.Variable(initial_value=1.0,
                                     trainable=True,
                                     dtype=tf.float32)

            hugectr_forward = hugectr_tf_ops.fprop_v3(
                embedding_name=embedding_name,
                row_offsets=row_offsets,
                value_tensors=value_tensor,
                nnz_array=nnz_array,
                bp_trigger=bp_trigger,
                is_training=True,
                # note: this comparison assumes max_nnz == embedding_vec_size, since
                # hugectr_forward is checked elementwise against tf_forward below
                output_shape=[batch_size, slot_num, max_nnz])

            # print("hugectr_results=\n", hugectr_forward)

            # tf ops
            reshape_input_keys = np.reshape(input_keys, [-1, max_nnz])
            tf_indices = tf.where(reshape_input_keys != -1)
            tf_values = tf.gather_nd(reshape_input_keys, tf_indices)
            sparse_tensor = tf.sparse.SparseTensor(tf_indices, tf_values,
                                                   reshape_input_keys.shape)

            # FIXME: if there are too many nnz=0 slots, tf.nn.embedding_lookup_sparse may get wrong results?
            tf_embedding_layer = OriginalEmbedding(
                vocabulary_size=vocabulary_size,
                embedding_vec_size=embedding_vec_size,
                initializer=init_value,
                combiner='sum',
                gpus=gpus)

            tf_forward = tf_embedding_layer(
                sparse_tensor,
                output_shape=[batch_size, slot_num, embedding_vec_size])
            # print("tf_results=\n", tf_forward)

            # compare first forward result
            try:
                tf.debugging.assert_near(hugectr_forward, tf_forward)
            except tf.errors.InvalidArgumentError as error:
                raise error

            print(
                "[INFO]: The results from HugeCTR and tf in the first forward propagation are the same."
            )

        # backward
        hugectr_grads = tape.gradient(hugectr_forward, bp_trigger)

        tf_opt = tf.keras.optimizers.Adam(learning_rate=0.1,
                                          beta_1=0.9,
                                          beta_2=0.99,
                                          epsilon=1e-5)
        tf_grads = tape.gradient(tf_forward,
                                 tf_embedding_layer.trainable_weights)
        tf_opt.apply_gradients(
            zip(tf_grads, tf_embedding_layer.trainable_weights))

        # compare second forward result
        hugectr_forward_2 = hugectr_tf_ops.fprop_v3(
            embedding_name=embedding_name,
            row_offsets=row_offsets,
            value_tensors=value_tensor,
            nnz_array=nnz_array,
            bp_trigger=bp_trigger,
            is_training=True,
            output_shape=[batch_size, slot_num, max_nnz])

        tf_forward_2 = tf_embedding_layer(
            sparse_tensor,
            output_shape=[batch_size, slot_num, embedding_vec_size])

        # print("hugectr 2:\n", hugectr_forward_2)
        # print("tf 2:\n", tf_forward_2)
        try:
            tf.debugging.assert_near(hugectr_forward_2,
                                     tf_forward_2,
                                     rtol=1e-4,
                                     atol=1e-5)
        except tf.errors.InvalidArgumentError as error:
            raise error

        print(
            "[INFO]: The results from HugeCTR and tf in the second forward propagation are the same."
        )
        hugectr_tf_ops.reset()
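OriginalEmbedding is the plain-TensorFlow reference layer the test compares against; it is defined elsewhere in the test file. As a rough, single-GPU sketch of what such a reference could look like, assuming it wraps tf.nn.embedding_lookup_sparse (the op the FIXME above refers to) and ignoring the gpus argument:

import tensorflow as tf


class OriginalEmbedding(tf.keras.layers.Layer):
    """Sketch of a plain-TF reference embedding; not the actual implementation."""

    def __init__(self, vocabulary_size, embedding_vec_size,
                 initializer, combiner='sum', gpus=None):
        super().__init__()
        # `initializer` is the dense numpy table used to seed both implementations
        self.embeddings = tf.Variable(initial_value=initializer,
                                      dtype=tf.float32,
                                      trainable=True)
        self.combiner = combiner

    def call(self, sparse_ids, output_shape):
        # sparse_ids: SparseTensor of shape [batch_size * slot_num, max_nnz]
        # rows with nnz == 0 are the pitfall mentioned in the FIXME above,
        # since embedding_lookup_sparse may drop trailing empty rows
        combined = tf.nn.embedding_lookup_sparse(self.embeddings,
                                                 sparse_ids,
                                                 None,
                                                 combiner=self.combiner)
        return tf.reshape(combined, output_shape)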
Example #4
def tf_distribute_keys_fprop_v3(embedding_type):
    with tf.GradientTape() as tape:
        with tf.device("/gpu:0"):

            vocabulary_size = 8
            slot_num = 3
            embedding_vec_size = 4

            init_value = np.float32([
                i for i in range(1, vocabulary_size * embedding_vec_size + 1)
            ]).reshape(vocabulary_size, embedding_vec_size)
            # init_value = False
            # print(init_value)

            hugectr_tf_ops.init(visiable_gpus=[0, 1, 3, 4],
                                seed=123,
                                key_type='int64',
                                value_type='float',
                                batch_size=4,
                                batch_size_eval=4)
            embedding_name = hugectr_tf_ops.create_embedding(
                init_value=init_value,
                opt_hparams=[1.0, 0.9, 0.99, 1e-3],
                name_='test_embedding',
                max_vocabulary_size_per_gpu=1737710,
                slot_num=slot_num,
                embedding_vec_size=embedding_vec_size,
                max_feature_num=4,
                embedding_type=embedding_type,
                max_nnz=2)

            keys = np.array(
                [[[0, -1], [1, -1], [2, 6]], [[0, -1], [1, -1], [-1, -1]],
                 [[0, -1], [1, -1], [6, -1]], [[0, -1], [1, -1], [2, -1]]],
                dtype=np.int64)

            row_offsets, value_tensors, nnz_array = _distribute_kyes(
                tf.convert_to_tensor(keys),
                gpu_count=4,
                embedding_type=embedding_type)
            print("row_ptrs", row_offsets)
            print("\nvalues", value_tensors)
            print("\n", nnz_array)

            row_offsets, value_tensors, nnz_array = _distribute_kyes(
                tf.convert_to_tensor(keys),
                gpu_count=4,
                embedding_type=embedding_type)
            print("\nrow_ptrs", row_offsets)
            print("\nvalues", value_tensors)
            print("\n", nnz_array)
            # print("\n", _distribute_kyes.pretty_printed_concrete_signatures(), "\n")

            bp_trigger = tf.Variable(
                initial_value=[1.0, 2.0],
                trainable=True,
                dtype=tf.float32,
                name='embedding_plugin_bprop_trigger')  # must be trainable

            forward_result = hugectr_tf_ops.fprop_v3(
                embedding_name=embedding_name,
                row_offsets=row_offsets,
                nnz_array=nnz_array,
                value_tensors=value_tensors,
                is_training=True,
                bp_trigger=bp_trigger,
                output_shape=[4, slot_num, embedding_vec_size])
            print("first step: \n", forward_result)

            grads = tape.gradient(forward_result, bp_trigger)

            forward_result = hugectr_tf_ops.fprop_v3(
                embedding_name=embedding_name,
                row_offsets=row_offsets,
                nnz_array=nnz_array,
                value_tensors=value_tensors,
                is_training=False,
                bp_trigger=bp_trigger,
                output_shape=[4, slot_num, embedding_vec_size])
            print("second step: \n", forward_result)