def call(self, row_offsets, value_tensors, nnz_array, training=True):
    # forward propagation of the embedding layer
    return hugectr_tf_ops.fprop_v3(embedding_name=self.embedding_name,
                                   row_offsets=row_offsets,
                                   value_tensors=value_tensors,
                                   nnz_array=nnz_array,
                                   bp_trigger=self.bp_trigger,
                                   is_training=training,
                                   output_shape=[self.batch_size,
                                                 self.slot_num,
                                                 self.embedding_vec_size])
def call(self, row_offsets, value_tensors, nnz_array, output_shape, training=False):
    # forward propagation; the output shape is supplied by the caller
    return hugectr_tf_ops.fprop_v3(embedding_name=self.name_,
                                   row_offsets=row_offsets,
                                   value_tensors=value_tensors,
                                   nnz_array=nnz_array,
                                   bp_trigger=self.bp_trigger,
                                   is_training=training,
                                   output_shape=output_shape)
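# A minimal usage sketch (not from the original source) contrasting the two
# call() variants above: the first layer fixes its output shape at
# construction time, while the second takes output_shape per call. The layer
# instances `train_layer` and `eval_layer` and the hyperparameter names
# (batch_size, slot_num, embedding_vec_size) are assumptions for
# illustration; row_offsets, value_tensors and nnz_array are the CSR-style
# inputs produced by the preprocessing shown further below.
train_out = train_layer(row_offsets, value_tensors, nnz_array, training=True)
eval_out = eval_layer(row_offsets, value_tensors, nnz_array,
                      output_shape=[batch_size, slot_num, embedding_vec_size],
                      training=False)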
def _fprop_v3_VS_tf():
    print("[INFO]: Testing fprop_v3 vs tf...")
    if vocabulary_size < slot_num:
        raise RuntimeError("vocabulary_size must be larger than slot_num.")

    with tf.GradientTape(persistent=True) as tape:
        # initialize the embedding table
        init_value = np.float32(
            np.random.normal(loc=0, scale=1,
                             size=(vocabulary_size, embedding_vec_size)))

        # generate input keys; -1 marks an unused position
        # TODO: Keys in different slots should be unique.
        input_keys = np.ones(shape=(batch_size, slot_num, max_nnz),
                             dtype=np.int64) * -1
        each_slot = vocabulary_size // slot_num
        nnz_0_num = 0
        for batch_id in range(batch_size):
            for slot_id in range(slot_num):
                # how many keys in this slot; after one slot gets nnz = 0,
                # force nnz >= 1 for all remaining slots
                nnz = np.random.randint(low=nnz_0_num, high=max_nnz + 1,
                                        size=1)[0]
                if nnz == 0:
                    nnz_0_num = 1

                if embedding_type == 'distributed':
                    keys = np.random.randint(low=slot_id * each_slot,
                                             high=(slot_id + 1) * each_slot,
                                             size=nnz)
                elif embedding_type == 'localized':
                    # localized embedding requires key % slot_num == slot_id
                    keys = []
                    while len(keys) < nnz:
                        key = np.random.randint(low=slot_id * each_slot,
                                                high=(slot_id + 1) * each_slot,
                                                size=1)[0]
                        if key % slot_num == slot_id:
                            keys.append(key)

                input_keys[batch_id, slot_id, 0:nnz] = keys

        # hugectr ops
        hugectr_tf_ops.init(visiable_gpus=gpus,
                            key_type='int64',
                            value_type='float',
                            batch_size=batch_size,
                            batch_size_eval=len(gpus))
        embedding_name = hugectr_tf_ops.create_embedding(
            init_value=init_value,
            opt_hparams=[0.1, 0.9, 0.99, 1e-5],
            name_='hugectr_embedding',
            max_vocabulary_size_per_gpu=(vocabulary_size // len(gpus)) * 2 + 1,
            slot_num=slot_num,
            embedding_vec_size=embedding_vec_size,
            max_feature_num=slot_num * max_nnz,
            embedding_type=embedding_type,
            max_nnz=max_nnz,
            update_type='Global')

        # use CreateDataset to do the preprocessing
        dataset_utils = CreateDataset(dataset_names=None,
                                      feature_desc=None,
                                      batch_size=batch_size,
                                      n_epochs=1,
                                      slot_num=slot_num,
                                      max_nnz=max_nnz,
                                      convert_to_csr=None,
                                      gpu_count=len(gpus),
                                      embedding_type=embedding_type,
                                      get_row_indices=None)
        if embedding_type == "distributed":
            row_offsets, value_tensor, nnz_array = \
                dataset_utils._distribute_keys_for_distributed(input_keys)
        elif embedding_type == "localized":
            row_offsets, value_tensor, nnz_array = \
                dataset_utils._distribute_keys_for_localized(input_keys)
        else:
            raise RuntimeError("Unsupported embedding_type %s" % embedding_type)

        bp_trigger = tf.Variable(initial_value=1.0, trainable=True,
                                 dtype=tf.float32)
        # the output embedding vectors have embedding_vec_size as the last dim
        hugectr_forward = hugectr_tf_ops.fprop_v3(
            embedding_name=embedding_name,
            row_offsets=row_offsets,
            value_tensors=value_tensor,
            nnz_array=nnz_array,
            bp_trigger=bp_trigger,
            is_training=True,
            output_shape=[batch_size, slot_num, embedding_vec_size])
        # print("hugectr_results=\n", hugectr_forward)

        # tf ops
        reshape_input_keys = np.reshape(input_keys, [-1, max_nnz])
        tf_indices = tf.where(reshape_input_keys != -1)
        tf_values = tf.gather_nd(reshape_input_keys, tf_indices)
        sparse_tensor = tf.sparse.SparseTensor(tf_indices, tf_values,
                                               reshape_input_keys.shape)
        # FIXME: if there are too many nnz=0 slots,
        # tf.nn.embedding_lookup_sparse may get wrong results?
        tf_embedding_layer = OriginalEmbedding(
            vocabulary_size=vocabulary_size,
            embedding_vec_size=embedding_vec_size,
            initializer=init_value,
            combiner='sum',
            gpus=gpus)
        tf_forward = tf_embedding_layer(
            sparse_tensor,
            output_shape=[batch_size, slot_num, embedding_vec_size])
        # print("tf_results=\n", tf_forward)

    # compare the first forward results
    try:
        tf.debugging.assert_near(hugectr_forward, tf_forward)
    except tf.errors.InvalidArgumentError as error:
        raise error
    print("[INFO]: The results from HugeCTR and tf in the first forward "
          "propagation are the same.")

    # backward
    hugectr_grads = tape.gradient(hugectr_forward, bp_trigger)
    tf_opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9,
                                      beta_2=0.99, epsilon=1e-5)
    tf_grads = tape.gradient(tf_forward, tf_embedding_layer.trainable_weights)
    tf_opt.apply_gradients(zip(tf_grads, tf_embedding_layer.trainable_weights))

    # compare the second forward results
    hugectr_forward_2 = hugectr_tf_ops.fprop_v3(
        embedding_name=embedding_name,
        row_offsets=row_offsets,
        value_tensors=value_tensor,
        nnz_array=nnz_array,
        bp_trigger=bp_trigger,
        is_training=True,
        output_shape=[batch_size, slot_num, embedding_vec_size])
    tf_forward_2 = tf_embedding_layer(
        sparse_tensor,
        output_shape=[batch_size, slot_num, embedding_vec_size])
    # print("hugectr 2:\n", hugectr_forward_2)
    # print("tf 2:\n", tf_forward_2)
    try:
        tf.debugging.assert_near(hugectr_forward_2, tf_forward_2,
                                 rtol=1e-4, atol=1e-5)
    except tf.errors.InvalidArgumentError as error:
        raise error
    print("[INFO]: The results from HugeCTR and tf in the second forward "
          "propagation are the same.")

    hugectr_tf_ops.reset()
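# Hypothetical driver for _fprop_v3_VS_tf(). The test reads its
# hyperparameters from module-level globals; the concrete values below are
# assumptions for illustration only, not taken from the original script.
if __name__ == "__main__":
    gpus = [0, 1]
    batch_size = 16
    slot_num = 10
    max_nnz = 4
    vocabulary_size = 1000   # must be larger than slot_num
    embedding_vec_size = 4
    embedding_type = 'distributed'  # or 'localized'
    _fprop_v3_VS_tf()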
def tf_distribute_keys_fprop_v3(embedding_type):
    with tf.GradientTape() as tape:
        with tf.device("/gpu:0"):
            vocabulary_size = 8
            slot_num = 3
            embedding_vec_size = 4
            init_value = np.float32(
                [i for i in range(1, vocabulary_size * embedding_vec_size + 1)]
            ).reshape(vocabulary_size, embedding_vec_size)
            # init_value = False
            # print(init_value)

            hugectr_tf_ops.init(visiable_gpus=[0, 1, 3, 4],
                                seed=123,
                                key_type='int64',
                                value_type='float',
                                batch_size=4,
                                batch_size_eval=4)
            embedding_name = hugectr_tf_ops.create_embedding(
                init_value=init_value,
                opt_hparams=[1.0, 0.9, 0.99, 1e-3],
                name_='test_embedding',
                max_vocabulary_size_per_gpu=1737710,
                slot_num=slot_num,
                embedding_vec_size=embedding_vec_size,
                max_feature_num=4,
                embedding_type=embedding_type,
                max_nnz=2)

            keys = np.array([[[0, -1], [1, -1], [2, 6]],
                             [[0, -1], [1, -1], [-1, -1]],
                             [[0, -1], [1, -1], [6, -1]],
                             [[0, -1], [1, -1], [2, -1]]],
                            dtype=np.int64)

            row_offsets, value_tensors, nnz_array = _distribute_kyes(
                tf.convert_to_tensor(keys), gpu_count=4,
                embedding_type=embedding_type)
            print("row_ptrs", row_offsets)
            print("\nvalues", value_tensors)
            print("\n", nnz_array)

            # the second call reuses the already-traced tf.function
            row_offsets, value_tensors, nnz_array = _distribute_kyes(
                tf.convert_to_tensor(keys), gpu_count=4,
                embedding_type=embedding_type)
            print("\nrow_ptrs", row_offsets)
            print("\nvalues", value_tensors)
            print("\n", nnz_array)
            # print("\n", _distribute_kyes.pretty_printed_concrete_signatures(), "\n")

            bp_trigger = tf.Variable(initial_value=[1.0, 2.0],
                                     trainable=True,
                                     dtype=tf.float32,
                                     name='embedding_plugin_bprop_trigger')  # must be trainable

            forward_result = hugectr_tf_ops.fprop_v3(
                embedding_name=embedding_name,
                row_offsets=row_offsets,
                nnz_array=nnz_array,
                value_tensors=value_tensors,
                is_training=True,
                bp_trigger=bp_trigger,
                output_shape=[4, slot_num, embedding_vec_size])
            print("first step: \n", forward_result)

    grads = tape.gradient(forward_result, bp_trigger)

    forward_result = hugectr_tf_ops.fprop_v3(
        embedding_name=embedding_name,
        row_offsets=row_offsets,
        nnz_array=nnz_array,
        value_tensors=value_tensors,
        is_training=False,
        bp_trigger=bp_trigger,
        output_shape=[4, slot_num, embedding_vec_size])
    print("second step: \n", forward_result)
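# Hypothetical driver (an assumption, not from the original source): exercise
# the key-distribution check for both plugin embedding types, calling
# hugectr_tf_ops.reset() between runs so the next init()/create_embedding()
# starts from a clean state, the same way _fprop_v3_VS_tf() above resets at
# the end of its run.
for etype in ['distributed', 'localized']:
    tf_distribute_keys_fprop_v3(etype)
    hugectr_tf_ops.reset()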