Beispiel #1
0
    def lookup_embedding(self, unique_ids):
        """Fetch the embedding vectors for `unique_ids`, initializing any
        vectors that are not yet present in the embedding service.

        Arguments:
            unique_ids: tensor-like of ids; `.numpy()` is called on it.

        Returns:
            A 2D numpy array of shape (len(unique_ids), self.output_dim),
            or whatever `self._lookup_embedding_func` returns when that
            override is set.

        Raises:
            Exception: if freshly written vectors still cannot be read back.
        """
        id_array = unique_ids.numpy()
        self._check_id_valid(id_array)
        # An externally supplied lookup function takes precedence.
        if self._lookup_embedding_func:
            return self._lookup_embedding_func(self._name, id_array)

        keys = [Embedding.get_key([self._name, item]) for item in id_array]
        embedding_vectors, unknown_keys_index = (
            EmbeddingService.lookup_embedding(
                keys=keys,
                embedding_service_endpoint=self.embedding_service_endpoint,
            )
        )

        if unknown_keys_index:
            # Initialize the missing keys' vectors and write them to Redis.
            unknown_keys = [keys[i] for i in unknown_keys_index]
            initializer = tf.keras.initializers.get(
                self.embeddings_initializer)
            init_block = np.concatenate(
                [
                    initializer(shape=[1, self.output_dim]).numpy()
                    for _ in unknown_keys
                ],
                axis=0,
            )
            # set_if_not_exist so a concurrent writer's vectors are kept.
            EmbeddingService.update_embedding(
                keys=unknown_keys,
                embedding_vectors=init_block,
                embedding_service_endpoint=self.embedding_service_endpoint,
                set_if_not_exist=True,
            )
            # Re-read the freshly initialized vectors.
            fresh_vectors, still_unknown = (
                EmbeddingService.lookup_embedding(
                    keys=unknown_keys,
                    embedding_service_endpoint=(
                        self.embedding_service_endpoint),
                )
            )
            if still_unknown:
                raise Exception("Update embedding vector: %s failed." % str(
                    [unknown_keys[i] for i in still_unknown]))
            for pos, vector in zip(unknown_keys_index, fresh_vectors):
                embedding_vectors[pos] = vector
        embedding_vectors = np.concatenate(embedding_vectors, axis=0)
        return embedding_vectors.reshape((len(keys), self.output_dim))
Beispiel #2
0
 def lookup_embedding(self,
                      ids,
                      layer_name,
                      initializer="uniform",
                      embedding_table_dim=128):
     """Return the embedding vectors for `ids` belonging to `layer_name`.

     Vectors missing from the embedding service are created with
     `initializer`, written back with set-if-not-exist semantics, and
     then looked up again.

     Arguments:
         ids: iterable of ids to look up.
         layer_name: name of the embedding layer the ids belong to.
         initializer: keras initializer identifier for unknown ids.
         embedding_table_dim: dimension of each embedding vector.

     Returns:
         A 2D numpy array of shape (len(ids), embedding_table_dim).

     Raises:
         Exception: if freshly written vectors still cannot be read back.
     """
     keys = [Embedding.get_key([layer_name, i]) for i in ids]
     vectors, missing_idx = EmbeddingService.lookup_embedding(
         keys=keys,
         embedding_service_endpoint=self._embedding_service_endpoint,
     )
     if missing_idx:
         # Initialize the missing keys' vectors and write them to Redis.
         missing_keys = [keys[i] for i in missing_idx]
         init_fn = tf.keras.initializers.get(initializer)
         init_block = np.concatenate(
             [
                 init_fn(shape=[1, embedding_table_dim]).numpy()
                 for _ in missing_keys
             ],
             axis=0,
         )
         # set_if_not_exist so a concurrent writer's vectors are kept.
         EmbeddingService.update_embedding(
             keys=missing_keys,
             embedding_vectors=init_block,
             embedding_service_endpoint=self._embedding_service_endpoint,
             set_if_not_exist=True,
         )
         # Re-read the freshly initialized vectors.
         fresh_vectors, still_missing = EmbeddingService.lookup_embedding(
             keys=missing_keys,
             embedding_service_endpoint=self._embedding_service_endpoint,
         )
         if still_missing:
             raise Exception("Update embedding vector: %s failed." % str(
                 [missing_keys[i] for i in still_missing]))
         for pos, vector in zip(missing_idx, fresh_vectors):
             vectors[pos] = vector
     vectors = np.concatenate(vectors, axis=0)
     return vectors.reshape((len(keys), embedding_table_dim))
Beispiel #3
0
    def _update_model(self):
        """Apply the accumulated gradients to the model and report updated
        ElasticDL embedding vectors back to the embedding service.

        Must be called with `self._lock` held. Clears every accumulator
        (`_gradient_sum`, `_gradient_sum_indexed`,
        `_edl_embedding_gradients`, `_grad_n`) on completion.

        Raises:
            RuntimeError: if the kv store is missing any embedding key
                referenced by the accumulated ElasticDL gradients.
        """
        assert self._lock.locked()
        grad_var = []

        # (grad, var) pairs excluding keras Embedding layer and
        # ElasticDL Embedding layer
        for k in self._gradient_sum:
            self._gradient_sum[k] = self._gradient_sum[k] / self._grad_to_wait
            grad_var.append((self._gradient_sum[k], self._model[k]))

        # (grad, var) pair of Keras Embedding layer
        for k in self._gradient_sum_indexed:
            grad_var.append((self._gradient_sum_indexed[k], self._model[k]))

        # (grad, var) pair of ElasticDL Embedding layer.
        # Layer names and unique ids are recorded in lockstep with the
        # grad_var entries appended below, so the final zip stays aligned
        # even when a layer is skipped. (Previously unique_ids_list was
        # appended before the `continue` below while layer names came from
        # _edl_embedding_gradients.keys(), which misaligned the report
        # whenever a layer had no embeddings.)
        edl_embedding_offset = len(grad_var)
        edl_layer_names = []
        unique_ids_list = []
        for layer_name, grads in self._edl_embedding_gradients.items():
            unique_ids, idx = tf.unique(grads.indices)
            grads_idx_transformed = tf.IndexedSlices(grads.values, idx)
            keys = [
                Embedding.get_key([layer_name, i])
                for i in unique_ids.numpy()
            ]
            embeddings, unknown_keys = EmbeddingService.lookup_embedding(
                embedding_service_endpoint=(
                    self._embedding_service_endpoint),
                keys=keys,
            )
            if unknown_keys:
                # Fixed typo: "reviced" -> "received".
                raise RuntimeError(
                    "Master received %d unknown embedding keys: %s ..." %
                    (len(unknown_keys), str(unknown_keys[0])))
            if not embeddings:
                continue
            embeddings = np.concatenate(embeddings,
                                        axis=0).reshape(len(keys), -1)
            embedding_var = tf.Variable(embeddings)
            edl_layer_names.append(layer_name)
            unique_ids_list.append(unique_ids)
            grad_var.append((grads_idx_transformed, embedding_var))

        # TODO: support optimizer with slots such as Adam, FTRL
        self._opt.apply_gradients(grad_var)

        # report updated embedding table to EmbeddingService
        self._update_edl_embedding_table(
            zip(
                edl_layer_names,
                unique_ids_list,
                [v for g, v in grad_var[edl_embedding_offset:]],
            ))
        self._update_model_version()
        self._gradient_sum.clear()
        self._gradient_sum_indexed.clear()
        self._edl_embedding_gradients.clear()
        self._grad_n = 0
    def test_lookup_and_update_embedding(self):
        """Exercise EmbeddingService update/lookup round trips against a
        freshly started local Redis cluster."""
        with tempfile.TemporaryDirectory() as temp_dir:
            endpoint = start_redis_instances(temp_dir)
            # Bring the service up and give the cluster time to form.
            service = EmbeddingService(endpoint)
            endpoint = service._create_redis_cluster()
            time.sleep(1)

            original = np.random.rand(100, 10).astype(np.float32)
            keys = ["test_%d" % i for i in range(original.shape[0])]

            # A plain update followed by a lookup returns exactly what
            # was written, with no unknown keys.
            EmbeddingService.update_embedding(keys, original, endpoint)
            fetched, missing = EmbeddingService.lookup_embedding(
                keys, endpoint, parse_type=np.float32)
            self.assertTrue(len(missing) == 0)
            num_keys = len(keys)
            fetched = np.concatenate(fetched, axis=0)
            fetched = fetched.reshape((num_keys, -1))
            self.assertTrue(np.equal(original, fetched).all())

            # With set_if_not_exist=True an update must NOT overwrite
            # vectors that already exist.
            replacement = np.random.rand(100, 10).astype(np.float32)
            self.assertFalse(np.equal(original, replacement).all())
            EmbeddingService.update_embedding(keys,
                                              replacement,
                                              endpoint,
                                              set_if_not_exist=True)
            fetched, missing = EmbeddingService.lookup_embedding(
                keys, endpoint, parse_type=np.float32)
            fetched = np.concatenate(fetched, axis=0)
            fetched = fetched.reshape((num_keys, -1))
            self.assertTrue(np.equal(original, fetched).all())
            self.assertFalse(np.equal(replacement, fetched).all())

            # Keys that were never written are all reported as unknown.
            absent_keys = ["test_no_exist_%d" % i for i in range(10)]
            fetched, missing = EmbeddingService.lookup_embedding(
                absent_keys, endpoint, parse_type=np.float32)
            self.assertTrue(len(missing) == 10)
            self.assertTrue(len(fetched) == 10)
            # Shut the service down cleanly.
            self.assertTrue(service.stop_embedding_service())
    def _lookup_embeddings_and_slots(self, grads_and_vars):
        """Look up embedding vectors and slot values from the kv store.

        Unknown *slot* keys are initialized on the fly; an unknown
        *embedding* key is treated as an error.

        Arguments:
            grads_and_vars: A list of (gradient, layer name) pairs.

        Returns:
            A tuple `(embed_values, slot_values)`. `embed_values` maps
            layer name to a 2D `numpy.ndarray` of embedding vectors.
            `slot_values` maps layer name to {slot name: 2D
            `numpy.ndarray`}.

        Raises:
            RuntimeError: If any unknown embedding key exists.
        """

        (
            embed_keys,
            slot_keys,
            embed_key_index,
            slot_key_index,
        ) = self._generate_lookup_keys(grads_and_vars)

        # Embedding keys come first so an index < num_embed_keys is an
        # embedding key and anything at or beyond it is a slot key.
        keys = embed_keys + slot_keys
        num_embed_keys = len(embed_keys)
        values, unknown_keys = EmbeddingService.lookup_embedding(
            keys=keys, embedding_service_endpoint=self._kv_store_endpoint
        )

        if unknown_keys:
            # Any unknown key in the embedding range is fatal.
            if unknown_keys[0] < num_embed_keys:
                raise RuntimeError(
                    "Failed to get key %s from kv store."
                    % embed_keys[unknown_keys[0]]
                )

            # The remaining unknown keys are slots: initialize them.
            for idx in unknown_keys:
                key = keys[idx]
                values[idx] = self._initialize_unknown_slot(
                    _get_embedding_layer_name_from_key(key),
                    _get_slot_name_from_key(key),
                )

        embed_values = _parse_lookup_values(
            values[:num_embed_keys], embed_key_index
        )
        slot_values = _parse_lookup_values(
            values[num_embed_keys:], slot_key_index
        )
        return embed_values, slot_values
Beispiel #6
0
    def test_lookup_embedding(self):
        """Worker.lookup_embedding should initialize ids missing from the
        embedding service so every id resolves to a vector."""
        mock_embedding_service = MockEmbeddingService()

        ids = [1, 2, 3, 4, 5, 6]
        layer_name = "test_edlembedding"
        embedding_table_dim = 10
        # Pre-populate vectors for ids 1-3 only; ids 4-6 start unknown.
        mock_embedding_service.mock_embedding_table = {
            "%s-%d" % (layer_name, i): np.zeros(
                (1, embedding_table_dim), dtype=np.float32
            )
            for i in (1, 2, 3)
        }
        worker = Worker(
            1,
            JobType.TRAINING_ONLY,
            32,
            _model_zoo_path,
            model_def="embedding_test_module.CustomModel",
            channel=None,
        )
        with mock.patch.object(
            EmbeddingService,
            "lookup_embedding",
            mock_embedding_service.mock_lookup_embedding,
        ), mock.patch.object(
            EmbeddingService,
            "update_embedding",
            mock_embedding_service.mock_update_embedding,
        ):
            direct_vectors, direct_unknown = (
                EmbeddingService.lookup_embedding(
                    keys=["-".join([layer_name, str(i)]) for i in ids]
                )
            )
            worker_vectors = worker.lookup_embedding(
                ids=ids,
                layer_name=layer_name,
                embedding_table_dim=embedding_table_dim,
            )
            # A direct lookup sees all six keys but three are unknown.
            self.assertTrue(len(direct_vectors) == 6)
            self.assertTrue(len(direct_unknown) == 3)
            # The worker initializes the unknown ids, so all six resolve.
            self.assertTrue(len(worker_vectors) == 6)
            self.assertFalse(None in worker_vectors)
 def lookup_func(ids, layer_name, initializer, output_dim):
     """Look up embedding vectors for `ids` under `layer_name` and return
     them stacked as a (len(ids), -1) numpy array.

     `initializer` and `output_dim` are part of the expected callback
     signature but are unused here; unknown-key indices returned by the
     service are ignored.
     """
     keys = [Embedding.get_key([layer_name, i]) for i in ids]
     vectors, _unknown = EmbeddingService.lookup_embedding(keys)
     return np.concatenate(vectors).reshape(len(ids), -1)