def testBowEncoderSparseTensor(self): with self.cached_session() as sess: docs = [[0, 1], [2, 3]] sparse_docs = sparse_ops.dense_to_sparse_tensor(docs) enc = encoders.bow_encoder(sparse_docs, 4, 3) sess.run(variables.global_variables_initializer()) self.assertAllEqual([2, 3], enc.eval().shape)
def bow_encoder(ids, vocab_size, embed_dim, sparse_lookup=True, initializer=None, regularizer=None, trainable=True, scope=None, reuse=None): """Maps a sequence of symbols to a vector per example by averaging embeddings. Args: ids: `[batch_size, doc_length]` `Tensor` or `SparseTensor` of type `int32` or `int64` with symbol ids. vocab_size: Integer number of symbols in vocabulary. embed_dim: Integer number of dimensions for embedding matrix. sparse_lookup: `bool`, if `True`, converts ids to a `SparseTensor` and performs a sparse embedding lookup. This is usually faster, but not desirable if padding tokens should have an embedding. Empty rows are assigned a special embedding. initializer: An initializer for the embeddings, if `None` default for current scope is used. regularizer: Optional regularizer for the embeddings. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional string specifying the variable scope for the op, required if `reuse=True`. reuse: If `True`, variables inside the op will be reused. Returns: Encoding `Tensor` `[batch_size, embed_dim]` produced by averaging embeddings. Raises: ValueError: If `embed_dim` or `vocab_size` are not specified. """ if not vocab_size or not embed_dim: raise ValueError('Must specify vocab size and embedding dimension') with variable_scope.variable_scope(scope, 'bow_encoder', [ids], reuse=reuse): embeddings = variables.model_variable('embeddings', shape=[vocab_size, embed_dim], initializer=initializer, regularizer=regularizer, trainable=trainable) if sparse_lookup: if isinstance(ids, sparse_tensor.SparseTensor): sparse_ids = ids else: sparse_ids = sparse_ops.dense_to_sparse_tensor(ids) return contrib_embedding_ops.safe_embedding_lookup_sparse( [embeddings], sparse_ids, combiner='mean', default_id=0) else: if isinstance(ids, sparse_tensor.SparseTensor): raise TypeError('ids are expected to be dense Tensor, got: %s', ids) return math_ops.reduce_mean(embedding_ops.embedding_lookup( embeddings, ids), axis=1)
def test_dense_to_sparse_tensor_2d(self): with self.cached_session() as sess: st = sparse_ops.dense_to_sparse_tensor([[1, 2, 0, 0], [3, 4, 5, 0]]) result = sess.run(st) self.assertAllEqual([[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]], result.indices) self.assertAllEqual([1, 2, 3, 4, 5], result.values) self.assertAllEqual([2, 4], result.dense_shape)
def test_dense_to_sparse_tensor_unknown_1d_shape(self): with self.cached_session() as sess: tensor = array_ops.placeholder(shape=[None], dtype=dtypes.int32) st = sparse_ops.dense_to_sparse_tensor(tensor) result = sess.run(st, feed_dict={tensor: [0, 100, 0, 3]}) self.assertAllEqual([[1], [3]], result.indices) self.assertAllEqual([100, 3], result.values) self.assertAllEqual([4], result.dense_shape)
def test_dense_to_sparse_unknown_rank(self): ph = array_ops.placeholder(dtype=dtypes.int32) with self.cached_session() as sess: st = sparse_ops.dense_to_sparse_tensor(ph) result = sess.run(st, feed_dict={ph: [[1, 2, 0, 0], [3, 4, 5, 0]]}) self.assertAllEqual([[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]], result.indices) self.assertAllEqual([1, 2, 3, 4, 5], result.values) self.assertAllEqual([2, 4], result.dense_shape)
def test_dense_to_sparse_tensor_1d_str(self): with self.cached_session() as sess: st = sparse_ops.dense_to_sparse_tensor([b'qwe', b'', b'ewq', b'']) result = sess.run(st) self.assertEqual(result.indices.dtype, np.int64) self.assertEqual(result.values.dtype, np.object) self.assertEqual(result.dense_shape.dtype, np.int64) self.assertAllEqual([[0], [2]], result.indices) self.assertAllEqual([b'qwe', b'ewq'], result.values) self.assertAllEqual([4], result.dense_shape)
def test_dense_to_sparse_tensor_1d_bool(self): with self.cached_session() as sess: st = sparse_ops.dense_to_sparse_tensor([True, False, True, False]) result = sess.run(st) self.assertEqual(result.indices.dtype, np.int64) self.assertEqual(result.values.dtype, np.bool) self.assertEqual(result.dense_shape.dtype, np.int64) self.assertAllEqual([[0], [2]], result.indices) self.assertAllEqual([True, True], result.values) self.assertAllEqual([4], result.dense_shape)
def test_dense_to_sparse_tensor_1d_float(self): with self.cached_session() as sess: st = sparse_ops.dense_to_sparse_tensor([1.5, 0.0, 2.3, 0.0]) result = sess.run(st) self.assertEqual(result.indices.dtype, np.int64) self.assertEqual(result.values.dtype, np.float32) self.assertEqual(result.dense_shape.dtype, np.int64) self.assertAllEqual([[0], [2]], result.indices) self.assertAllClose([1.5, 2.3], result.values) self.assertAllEqual([4], result.dense_shape)
def test_dense_to_sparse_tensor_1d(self): with self.cached_session() as sess: st = sparse_ops.dense_to_sparse_tensor([1, 0, 2, 0]) result = sess.run(st) self.assertEqual(result.indices.dtype, np.int64) self.assertEqual(result.values.dtype, np.int32) self.assertEqual(result.dense_shape.dtype, np.int64) self.assertAllEqual([[0], [2]], result.indices) self.assertAllEqual([1, 2], result.values) self.assertAllEqual([4], result.dense_shape)
def test_dense_to_sparse_tensor_unknown_3d_shape(self): with self.cached_session() as sess: tensor = array_ops.placeholder( shape=[None, None, None], dtype=dtypes.int32) st = sparse_ops.dense_to_sparse_tensor(tensor) result = sess.run(st, feed_dict={ tensor: [[[1, 2, 0, 0], [3, 4, 5, 0]], [[7, 8, 0, 0], [9, 0, 0, 0]]] }) self.assertAllEqual([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [0, 1, 2], [1, 0, 0], [1, 0, 1], [1, 1, 0]], result.indices) self.assertAllEqual([1, 2, 3, 4, 5, 7, 8, 9], result.values) self.assertAllEqual([2, 2, 4], result.dense_shape)
def testBowEncoderSparseTensorDenseLookup(self): with self.cached_session(): docs = [[0, 1]] sparse_docs = sparse_ops.dense_to_sparse_tensor(docs) with self.assertRaises(TypeError): encoders.bow_encoder(sparse_docs, 4, 3, sparse_lookup=False)