def test_large_batch(self):
    """Crosses three sparse columns over a 5000-row batch.

    The batch is made large in order to force multithreading.
    """
    rows = range(5000)
    first_column = [
        ['batch%d-FC1-F1' % r, 'batch%d-FC1-F2' % r, 'batch%d-FC1-F3' % r]
        for r in rows
    ]
    second_column = [['batch%d-FC2-F1' % r] for r in rows]
    third_column = [['batch%d-FC3-F1' % r, 'batch%d-FC3-F2' % r] for r in rows]

    crossed = sparse_feature_cross_op.sparse_feature_cross([
        self._sparse_tensor(column)
        for column in (first_column, second_column, third_column)
    ])

    # Each row is expected to hold the 3 x 1 x 2 combinations of its own
    # features, joined with "_X_".
    expected_rows = [[
        'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (r, r, r),
        'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (r, r, r),
        'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (r, r, r),
        'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (r, r, r),
        'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (r, r, r),
        'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (r, r, r)
    ] for r in rows]

    expected = self._sparse_tensor(expected_rows)
    with self.test_session() as sess:
      self._assert_sparse_tensor_equals(expected, sess.run(crossed))
Exemple #2
0
  def insert_transformed_feature(self, columns_to_tensors):
    """Builds the hashed cross of this column's leaves and stores it.

    The cross is computed over the tensors of every non-crossed column
    reachable through nested `_CrossedColumn.columns`, and the result is
    written to `columns_to_tensors[self]`.

    Args:
      columns_to_tensors: Mapping that is read for the input tensors
        (by `name` for `_SparseColumn` leaves, by the column object itself
        otherwise) and updated in place with the crossed tensor.
    """

    def _gather_leaves(crossed, leaves):
      """Appends the non-crossed columns under `crossed` to `leaves`."""
      for column in crossed.columns:
        if isinstance(column, _CrossedColumn):
          _gather_leaves(column, leaves)
        else:
          leaves.append(column)
      return leaves

    inputs = []
    for leaf in _gather_leaves(self, []):
      if isinstance(leaf, _SparseColumn):
        # Sparse columns are looked up by their name.
        key = leaf.name
      else:
        # Other columns are looked up by the column object; ask the column
        # to insert its own transformed tensor first when it is missing.
        if leaf not in columns_to_tensors:
          leaf.insert_transformed_feature(columns_to_tensors)
        key = leaf
      inputs.append(columns_to_tensors[key])

    columns_to_tensors[self] = sparse_feature_cross_op.sparse_feature_cross(
        inputs, hashed_output=True, num_buckets=self.hash_bucket_size)
Exemple #3
0
 def test_dense(self):
     """Crosses two dense string tensors, each with two rows of two values."""
     first = constant_op.constant(
         [['batch1-FC1-F1', 'batch1-FC1-F2'],
          ['batch2-FC1-F1', 'batch2-FC1-F2']], dtypes.string)
     second = constant_op.constant(
         [['batch1-FC2-F1', 'batch1-FC2-F2'],
          ['batch2-FC2-F1', 'batch2-FC2-F2']], dtypes.string)
     crossed = sparse_feature_cross_op.sparse_feature_cross([first, second])

     batch1_crosses = [
         'batch1-FC1-F1_X_batch1-FC2-F1',
         'batch1-FC1-F1_X_batch1-FC2-F2',
         'batch1-FC1-F2_X_batch1-FC2-F1',
         'batch1-FC1-F2_X_batch1-FC2-F2'
     ]
     batch2_crosses = [
         'batch2-FC1-F1_X_batch2-FC2-F1',
         'batch2-FC1-F1_X_batch2-FC2-F2',
         'batch2-FC1-F2_X_batch2-FC2-F1',
         'batch2-FC1-F2_X_batch2-FC2-F2'
     ]
     expected = self._sparse_tensor([batch1_crosses, batch2_crosses])
     with self.cached_session() as sess:
         result = sess.run(crossed)
         self._assert_sparse_tensor_equals(expected, result)
  def insert_transformed_feature(self, columns_to_tensors):
    """Computes the hashed cross for this column and records it.

    Leaf columns are found by walking `columns` recursively through any
    nested `_CrossedColumn`. Their tensors are crossed with hashed output
    into `self.hash_bucket_size` buckets, and the result is stored under
    `columns_to_tensors[self]`.

    Args:
      columns_to_tensors: Mapping holding the input tensors; it is updated
        in place with the crossed tensor (and with any leaf tensors that had
        to be inserted on the way).
    """

    def _leaves_of(crossed, found):
      """Adds every non-crossed column below `crossed` to `found`."""
      for member in crossed.columns:
        if isinstance(member, _CrossedColumn):
          _leaves_of(member, found)
        else:
          found.append(member)
      return found

    tensors = []
    for column in _leaves_of(self, []):
      if isinstance(column, _SparseColumn):
        # Sparse columns are keyed by name in the mapping.
        lookup_key = column.name
      else:
        # Non-sparse leaves are keyed by the column object and may still
        # need to insert their own transformed tensor.
        if column not in columns_to_tensors:
          column.insert_transformed_feature(columns_to_tensors)
        lookup_key = column
      tensors.append(columns_to_tensors[lookup_key])

    columns_to_tensors[self] = sparse_feature_cross_op.sparse_feature_cross(
        tensors, hashed_output=True, num_buckets=self.hash_bucket_size)
Exemple #5
0
  def test_large_batch(self):
    """Crosses three sparse columns with a large batch size.

    The batch of 5000 rows is meant to force multithreading.
    """
    batch = range(5000)
    fc1 = [
        ['batch%d-FC1-F1' % i, 'batch%d-FC1-F2' % i, 'batch%d-FC1-F3' % i]
        for i in batch
    ]
    fc2 = [['batch%d-FC2-F1' % i] for i in batch]
    fc3 = [['batch%d-FC3-F1' % i, 'batch%d-FC3-F2' % i] for i in batch]

    sparse_inputs = [self._sparse_tensor(values) for values in (fc1, fc2, fc3)]
    crossed = sparse_feature_cross_op.sparse_feature_cross(sparse_inputs)

    # Expected crosses for each row: every FC1 feature paired with the single
    # FC2 feature and each FC3 feature.
    expected_values = [[
        'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (i, i, i),
        'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (i, i, i),
        'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (i, i, i),
        'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (i, i, i),
        'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (i, i, i),
        'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (i, i, i)
    ] for i in batch]

    expected = self._sparse_tensor(expected_values)
    with self.test_session() as sess:
      actual = sess.run(crossed)
      self._assert_sparse_tensor_equals(expected, actual)
  def test_all_columns_empty(self):
    """Crosses three empty sparse columns.

    The crossed tensor is expected to be empty as well.
    """
    empty_columns = [self._sparse_tensor([]) for _ in range(3)]
    crossed = sparse_feature_cross_op.sparse_feature_cross(empty_columns)
    with self.test_session() as sess:
      result = sess.run(crossed)
      self._assert_sparse_tensor_empty(result)
Exemple #7
0
  def test_all_columns_empty(self):
    """Checks that crossing only empty columns gives an empty result."""
    first = self._sparse_tensor([])
    second = self._sparse_tensor([])
    third = self._sparse_tensor([])
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [first, second, third])
    with self.test_session() as sess:
      self._assert_sparse_tensor_empty(sess.run(crossed))
Exemple #8
0
    def test_one_column_empty(self):
        """Crosses three sparse columns where the middle one has no values.

        The crossed tensor is expected to be empty.
        """
        leading = self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']])
        empty = self._sparse_tensor([], 1)
        trailing = self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
        crossed = sparse_feature_cross_op.sparse_feature_cross(
            [leading, empty, trailing])
        with self.cached_session() as sess:
            result = sess.run(crossed)
            self._assert_sparse_tensor_empty(result)
  def test_one_column_empty(self):
    """Checks that a single empty column makes the whole cross empty."""
    columns = [
        self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']]),
        # The only column without any values.
        self._sparse_tensor([], 1),
        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']]),
    ]
    crossed = sparse_feature_cross_op.sparse_feature_cross(columns)
    with self.test_session() as sess:
      result = sess.run(crossed)
      self._assert_sparse_tensor_empty(result)
Exemple #10
0
 def test_integer_mixed_string_sparse(self):
   """Crosses an integer sparse column with a string sparse column."""
   int_column = self._sparse_tensor([[11], [333, 55555]])
   str_column = self._sparse_tensor(
       [['batch1-FC2-F1'], ['batch2-FC2-F1', 'batch2-FC2-F2']])
   crossed = sparse_feature_cross_op.sparse_feature_cross(
       [int_column, str_column])

   # Integers are expected to appear in the crossed strings as decimals.
   first_row = ['11_X_batch1-FC2-F1']
   second_row = [
       '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2',
       '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2'
   ]
   expected = self._sparse_tensor([first_row, second_row])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(crossed))
Exemple #11
0
 def test_hashed_output_v1_has_collision(self):
     """Checks that the old fingerprint concatenation produces collisions."""
     # 359 and 359 + 1024 share their last 10 bits, so with 1024 buckets
     # every cross of the first row collides with the one of the second row.
     ids = constant_op.constant([[359], [359 + 1024]])
     digits = constant_op.constant([list(range(10)) for _ in range(2)])
     hashed = sparse_feature_cross_op.sparse_feature_cross(
         [digits, ids], hashed_output=True, num_buckets=1024)
     dense = sparse_ops.sparse_tensor_to_dense(hashed)
     with session.Session():
         rows = dense.eval()
         collides = numpy.equal(rows[0], rows[1])
         self.assertTrue(collides.all())
Exemple #12
0
 def test_hashed_output_zero_bucket(self):
     """Checks the hashed cross of three columns without `num_buckets`."""
     columns = [
         self._sparse_tensor([['batch1-FC1-F1']]),
         self._sparse_tensor([['batch1-FC2-F1']]),
         self._sparse_tensor([['batch1-FC3-F1']]),
     ]
     hashed = sparse_feature_cross_op.sparse_feature_cross(
         columns, hashed_output=True)
     # The exact hash is pinned so that unintentional hashing changes are
     # caught.
     expected = self._sparse_tensor([[3735511728867393167]])
     with self.cached_session() as sess:
         result = sess.run(hashed)
         self._assert_sparse_tensor_equals(expected, result)
 def test_integer_mixed_string_sparse(self):
   """Checks a cross between integer and string sparse inputs."""
   inputs = [
       self._sparse_tensor([[11], [333, 55555]]),
       self._sparse_tensor([['batch1-FC2-F1'],
                            ['batch2-FC2-F1', 'batch2-FC2-F2']]),
   ]
   crossed = sparse_feature_cross_op.sparse_feature_cross(inputs)

   expected_rows = [
       ['11_X_batch1-FC2-F1'],
       [
           '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2',
           '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2'
       ],
   ]
   expected = self._sparse_tensor(expected_rows)
   with self.test_session() as sess:
     actual = sess.run(crossed)
     self._assert_sparse_tensor_equals(expected, actual)
 def test_hashed_output_v1_has_collision(self):
   """Verifies the old fingerprint concatenation has collisions."""
   # The two ids below agree on their last 10 bits, which is why all the
   # crosses end up in the same buckets when 1024 buckets are used.
   base_id = 359
   colliding_ids = constant_op.constant([[base_id], [base_id + 1024]])
   digit_rows = constant_op.constant([list(range(10)), list(range(10))])
   hashed_cross = sparse_feature_cross_op.sparse_feature_cross(
       [digit_rows, colliding_ids], hashed_output=True, num_buckets=1024)
   dense_cross = sparse_ops.sparse_tensor_to_dense(hashed_cross)
   with session.Session():
     buckets = dense_cross.eval()
     self.assertTrue(numpy.equal(buckets[0], buckets[1]).all())
 def test_hashed_output_zero_bucket(self):
   """Pins the hashed cross value when `num_buckets` is not passed."""
   first = self._sparse_tensor([['batch1-FC1-F1']])
   second = self._sparse_tensor([['batch1-FC2-F1']])
   third = self._sparse_tensor([['batch1-FC3-F1']])
   hashed = sparse_feature_cross_op.sparse_feature_cross(
       [first, second, third], hashed_output=True)
   # Compare against the literal hash to detect unintentional hashing
   # changes.
   expected = self._sparse_tensor([[3735511728867393167]])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(hashed))
Exemple #16
0
 def test_integer_sparse_input(self):
   """Crosses an integer sparse column with a dense string tensor."""
   sparse_ints = self._sparse_tensor([[11], [333, 5555]])
   dense_strings = constant_op.constant(
       [['batch1-FC2-F1', 'batch1-FC2-F2'],
        ['batch2-FC2-F1', 'batch2-FC2-F2']], dtypes.string)
   crossed = sparse_feature_cross_op.sparse_feature_cross(
       [sparse_ints, dense_strings])

   first_row = ['11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2']
   second_row = [
       '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2',
       '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
   ]
   expected = self._sparse_tensor([first_row, second_row])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(crossed))
Exemple #17
0
 def test_hashed_output_zero_bucket_v2(self):
     """Checks the hashed cross when the default hash key is supplied."""
     columns = [
         self._sparse_tensor([['batch1-FC1-F1']]),
         self._sparse_tensor([['batch1-FC2-F1']]),
         self._sparse_tensor([['batch1-FC3-F1']]),
     ]
     hashed = sparse_feature_cross_op.sparse_feature_cross(
         columns,
         hashed_output=True,
         hash_key=layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)
     # The exact hash is pinned so that unintentional hashing changes are
     # caught.
     expected = self._sparse_tensor([[1971693436396284976]])
     with self.cached_session() as sess:
         result = sess.run(hashed)
         self._assert_sparse_tensor_equals(expected, result)
Exemple #18
0
 def test_simple(self):
   """Crosses two sparse string columns over a batch of two rows."""
   first = self._sparse_tensor(
       [['batch1-FC1-F1'], ['batch2-FC1-F1', 'batch2-FC1-F2']])
   second = self._sparse_tensor(
       [['batch1-FC2-F1'], ['batch2-FC2-F1', 'batch2-FC2-F2']])
   crossed = sparse_feature_cross_op.sparse_feature_cross([first, second])

   batch1_crosses = ['batch1-FC1-F1_X_batch1-FC2-F1']
   batch2_crosses = [
       'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
       'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
   ]
   expected = self._sparse_tensor([batch1_crosses, batch2_crosses])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(crossed))
 def test_simple(self):
   """Checks the plain string cross of two sparse columns."""
   inputs = [
       self._sparse_tensor([['batch1-FC1-F1'],
                            ['batch2-FC1-F1', 'batch2-FC1-F2']]),
       self._sparse_tensor([['batch1-FC2-F1'],
                            ['batch2-FC2-F1', 'batch2-FC2-F2']]),
   ]
   crossed = sparse_feature_cross_op.sparse_feature_cross(inputs)

   expected_rows = [
       ['batch1-FC1-F1_X_batch1-FC2-F1'],
       [
           'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
           'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
       ],
   ]
   expected = self._sparse_tensor(expected_rows)
   with self.test_session() as sess:
     actual = sess.run(crossed)
     self._assert_sparse_tensor_equals(expected, actual)
 def test_hashed_output_zero_bucket_v2(self):
   """Pins the hashed cross value obtained with the default hash key."""
   first = self._sparse_tensor([['batch1-FC1-F1']])
   second = self._sparse_tensor([['batch1-FC2-F1']])
   third = self._sparse_tensor([['batch1-FC3-F1']])
   hashed = sparse_feature_cross_op.sparse_feature_cross(
       [first, second, third],
       hashed_output=True,
       hash_key=layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)
   # Compare against the literal hash to detect unintentional hashing
   # changes.
   expected = self._sparse_tensor([[1971693436396284976]])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(hashed))
 def test_integer_sparse_input(self):
   """Checks a cross of sparse integers with dense strings."""
   inputs = [
       self._sparse_tensor([[11], [333, 5555]]),
       constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
                             ['batch2-FC2-F1', 'batch2-FC2-F2']],
                            dtypes.string),
   ]
   crossed = sparse_feature_cross_op.sparse_feature_cross(inputs)

   expected_rows = [
       ['11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2'],
       [
           '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2',
           '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
       ],
   ]
   expected = self._sparse_tensor(expected_rows)
   with self.test_session() as sess:
     actual = sess.run(crossed)
     self._assert_sparse_tensor_equals(expected, actual)
 def test_hashed_output_v2_has_no_collision(self):
   """Verifies the new fingerprint concatenation has no collisions."""
   # 359 and 359 + 1024 share their last 10 bits; with the hash key
   # supplied, none of the crosses are expected to collide nonetheless.
   base_id = 359
   ids = constant_op.constant([[base_id], [base_id + 1024]])
   digit_rows = constant_op.constant([list(range(10)), list(range(10))])
   hashed_cross = sparse_feature_cross_op.sparse_feature_cross(
       [digit_rows, ids],
       hashed_output=True,
       num_buckets=1024,
       hash_key=layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)
   dense_cross = sparse_ops.sparse_tensor_to_dense(hashed_cross)
   with session.Session():
     buckets = dense_cross.eval()
     self.assertTrue(numpy.not_equal(buckets[0], buckets[1]).all())
Exemple #23
0
 def test_hashed_output_v2_has_no_collision(self):
     """Checks that the new fingerprint concatenation avoids collisions."""
     # Even though 359 and 359 + 1024 agree on their last 10 bits, the
     # crosses of the two rows are expected to land in different buckets.
     ids = constant_op.constant([[359], [359 + 1024]])
     digits = constant_op.constant([list(range(10)) for _ in range(2)])
     hashed = sparse_feature_cross_op.sparse_feature_cross(
         [digits, ids],
         hashed_output=True,
         num_buckets=1024,
         hash_key=layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY)
     dense = sparse_ops.sparse_tensor_to_dense(hashed)
     with session.Session():
         rows = dense.eval()
         differs = numpy.not_equal(rows[0], rows[1])
         self.assertTrue(differs.all())
 def test_sparse_cross_dense(self):
   """Crosses a sparse string column with a dense string tensor."""
   sparse_column = self._sparse_tensor(
       [['batch1-FC1-F1'], ['batch2-FC1-F1', 'batch2-FC1-F2']])
   dense_column = constant_op.constant(
       [['batch1-FC2-F1', 'batch1-FC2-F2'],
        ['batch2-FC2-F1', 'batch2-FC2-F2']], dtypes.string)
   crossed = sparse_feature_cross_op.sparse_feature_cross(
       [sparse_column, dense_column])

   batch1_crosses = [
       'batch1-FC1-F1_X_batch1-FC2-F1', 'batch1-FC1-F1_X_batch1-FC2-F2'
   ]
   batch2_crosses = [
       'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
       'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
   ]
   expected = self._sparse_tensor([batch1_crosses, batch2_crosses])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(crossed))
Exemple #25
0
 def test_integer_mixed_string_dense(self):
   """Crosses a dense int64 tensor with a dense string tensor."""
   dense_ints = constant_op.constant(
       [[11, 333], [55555, 999999]], dtypes.int64)
   dense_strings = constant_op.constant(
       [['batch1-FC2-F1', 'batch1-FC2-F2'],
        ['batch2-FC2-F1', 'batch2-FC2-F2']], dtypes.string)
   crossed = sparse_feature_cross_op.sparse_feature_cross(
       [dense_ints, dense_strings])

   first_row = [
       '11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2', '333_X_batch1-FC2-F1',
       '333_X_batch1-FC2-F2'
   ]
   second_row = [
       '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2',
       '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
   ]
   expected = self._sparse_tensor([first_row, second_row])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(crossed))
Exemple #26
0
 def test_hashed_3x1x2(self):
     """Checks the hashed cross of columns holding 3, 1 and 2 features."""
     columns = [
         self._sparse_tensor(
             [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
         self._sparse_tensor([['batch1-FC2-F1']]),
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']]),
     ]
     hashed = sparse_feature_cross_op.sparse_feature_cross(
         columns, hashed_output=True, num_buckets=1000)
     with self.cached_session() as sess:
         result = sess.run(hashed)
         hashes = result.values
         # 3 * 1 * 2 crosses, all located in the first (only) row.
         self.assertEqual(6, len(hashes))
         expected_indices = [[0, position] for position in range(6)]
         self.assertAllEqual(expected_indices, result.indices)
         # Every hash has to fall inside the requested bucket range ...
         self.assertTrue(all(0 <= h < 1000 for h in hashes))
         # ... and no two crosses may share a bucket.
         self.assertTrue(len(hashes) == len(set(hashes)))
  def test_some_columns_empty(self):
    """Crosses columns of which some have no values for a row.

    The first and third columns only supply values for the first row, so
    the expected cross only contains values for that row.
    """
    first = self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']], 2)
    second = self._sparse_tensor([['batch1-FC2-F1'], ['batch2-FC2-F1']], 2)
    third = self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']], 2)
    crossed = sparse_feature_cross_op.sparse_feature_cross(
        [first, second, third])

    first_row_crosses = [
        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
    ]
    expected = self._sparse_tensor([first_row_crosses], 2)
    with self.test_session() as sess:
      self._assert_sparse_tensor_equals(expected, sess.run(crossed))
 def test_integer_mixed_string_dense(self):
   """Checks a cross of dense integers with dense strings."""
   inputs = [
       constant_op.constant([[11, 333], [55555, 999999]], dtypes.int64),
       constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
                             ['batch2-FC2-F1', 'batch2-FC2-F2']],
                            dtypes.string),
   ]
   crossed = sparse_feature_cross_op.sparse_feature_cross(inputs)

   expected_rows = [
       [
           '11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2',
           '333_X_batch1-FC2-F1', '333_X_batch1-FC2-F2'
       ],
       [
           '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2',
           '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
       ],
   ]
   expected = self._sparse_tensor(expected_rows)
   with self.test_session() as sess:
     actual = sess.run(crossed)
     self._assert_sparse_tensor_equals(expected, actual)
Exemple #29
0
    def test_some_columns_empty(self):
        """Checks the cross when some columns lack values for a row.

        Only the second column supplies a value for the second row, so the
        expected cross holds values for the first row only.
        """
        columns = [
            self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']], 2),
            self._sparse_tensor([['batch1-FC2-F1'], ['batch2-FC2-F1']], 2),
            self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']], 2),
        ]
        crossed = sparse_feature_cross_op.sparse_feature_cross(columns)

        expected_rows = [[
            'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
            'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
            'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
            'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
        ]]
        expected = self._sparse_tensor(expected_rows, 2)
        with self.cached_session() as sess:
            result = sess.run(crossed)
            self._assert_sparse_tensor_equals(expected, result)
 def test_permutation_3x1x2(self):
   """Crosses columns holding 3, 1 and 2 features for a single row."""
   first = self._sparse_tensor(
       [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']])
   second = self._sparse_tensor([['batch1-FC2-F1']])
   third = self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
   crossed = sparse_feature_cross_op.sparse_feature_cross(
       [first, second, third])

   # All 3 * 1 * 2 combinations, with the last column varying fastest.
   combinations = [
       'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
       'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
       'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
       'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
       'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
       'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
   ]
   expected = self._sparse_tensor([combinations])
   with self.test_session() as sess:
     self._assert_sparse_tensor_equals(expected, sess.run(crossed))
 def test_hashed_3x1x2(self):
   """Checks the hashed output of a 3x1x2 cross with 1000 buckets."""
   first = self._sparse_tensor(
       [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']])
   second = self._sparse_tensor([['batch1-FC2-F1']])
   third = self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
   hashed = sparse_feature_cross_op.sparse_feature_cross(
       [first, second, third], hashed_output=True, num_buckets=1000)
   with self.test_session() as sess:
     crossed = sess.run(hashed)
     buckets = crossed.values
     # Six crosses are expected, all in row 0 at consecutive positions.
     self.assertEqual(6, len(buckets))
     self.assertAllEqual([[0, pos] for pos in range(6)], crossed.indices)
     # Bucket ids must lie in [0, 1000) and be pairwise distinct.
     self.assertTrue(all(0 <= bucket < 1000 for bucket in buckets))
     distinct_count = len(set(buckets))
     self.assertTrue(len(buckets) == distinct_count)
Exemple #32
0
 def test_permutation_3x1x2(self):
     """Checks the string cross of columns with 3, 1 and 2 features."""
     columns = [
         self._sparse_tensor(
             [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
         self._sparse_tensor([['batch1-FC2-F1']]),
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']]),
     ]
     crossed = sparse_feature_cross_op.sparse_feature_cross(columns)

     expected_rows = [[
         'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
         'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
         'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
         'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
         'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
         'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]]
     expected = self._sparse_tensor(expected_rows)
     with self.cached_session() as sess:
         result = sess.run(crossed)
         self._assert_sparse_tensor_equals(expected, result)
def _sampled_scattered_embedding_lookup(
    params, values, dimension=None, sampled_candidates=None, hash_key=None,
    name=None):
  """Looks up embeddings using parameter hashing for each value in `values`.

  This method looks up selected embedding dimensions if `sampled_candidates` is
  given, otherwise looks up all dimensions.

  The i-th embedding component of a value v in `values` is found by retrieving
  the weight whose index is a fingerprint of the pair (v,i).
  The concept is explored as "feature hashing" for model compression in this
  paper: http://arxiv.org/pdf/1504.04788.pdf

  Feature hashing has the pleasant effect of allowing us to compute an embedding
  without needing a pre-determined vocabulary, relieving some amount of process
  complexity. It also allows for us to maintain embeddings for possibly
  trillions of features with a fixed amount of memory.

  Note that this is superior to out-of-vocabulary shared "hash buckets" in that
  the embedding is extremely likely to be unique for each token as opposed to
  being shared across probably-colliding tokens. The price is that we must
  compute a hash once for each scalar in the token's embedding as opposed to
  once per token.

  If `params` is a list, it represents a partition of the embedding parameters.
  Each tensor in the list should have the same length, except for the first ones
  which may have an additional element. For instance 10 parameters can be
  partitioned in 4 tensors with length `[3, 3, 2, 2]`.

  Args:
    params: A `Tensor`, `list` of `Tensors`, or `PartitionedVariable`.
      Each tensor must be of rank 1 with fully-defined shape.
    values: `Tensor` of values to be embedded with shape `[d0, ..., dn]`.
    dimension: Embedding dimension. The user must specify either `dimension` or
      `sampled_candidates`.
    sampled_candidates: An optional `Tensor` of slice indices to keep along the
      final dimension with shape `[d0, ..., dn, N]`. If given, `dimension` is
      ignored. If `None`, looks up all candidates.
    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
      function to combine the crosses fingerprints on SparseFeatureCrossOp
      (optional).
    name: An optional name for this op.

  Returns:
    A `Tensor` with shape `[d0, ..., dn, dimension]`.
    If `sampled_candidates` is given, the output shape is `[d0, ..., dn, N]`

  Raises:
    ValueError: if neither `dimension` nor `sampled_candidates` is given, if
      `dimension` is not positive, or if the partition size is invalid.
  """
  # Normalize `params` to a plain list of tensors.
  if isinstance(params, variables.PartitionedVariable):
    params = list(params)
  if not isinstance(params, list):
    params = [params]

  with ops.name_scope(name, "scattered_embedding_lookup",
                      params + [dimension, values]):
    # Flatten the values
    values_shape = array_ops.shape(values)
    values = array_ops.reshape(values, [-1, 1])

    if sampled_candidates is None:
      if dimension is None:
        raise ValueError(
            "You must specify either dimension or sampled_candidates.")
      if dimension <= 0:
        raise ValueError("Dimension must be >0. Given is %d" % dimension)
      # Look up every dimension: one row [0, ..., dimension - 1] per
      # flattened value.
      sampled_candidates = array_ops.tile(array_ops.expand_dims(
          math_ops.range(0, dimension), 0), array_ops.shape(values))
    else:
      # `dimension` is overridden by the size of the last axis of
      # `sampled_candidates` (a scalar tensor from here on).
      dimension = array_ops.shape(sampled_candidates)[
          math_ops.subtract(array_ops.rank(sampled_candidates), 1)]
      sampled_candidates_shape = array_ops.shape(sampled_candidates)
      dimension_tensor = array_ops.reshape(dimension, shape=[1,])
      expected_shape = array_ops.concat([values_shape, dimension_tensor], 0)
      # The reshape below only runs after the shape assertion has passed.
      with ops.control_dependencies([control_flow_ops.Assert(
          math_ops.reduce_all(math_ops.equal(sampled_candidates_shape,
                                             expected_shape)),
          ["The shape of sampled_candidates: ", sampled_candidates_shape,
           " does not match the shape of values: ", values_shape])]):
        # Flatten sampled_candidates, same way as values are flattened.
        sampled_candidates = array_ops.reshape(sampled_candidates,
                                               [-1, dimension])

    num_partitions = len(params)
    partition_sizes = []
    for p in range(num_partitions):
      shape = params[p].get_shape()
      shape.assert_has_rank(1)
      shape.assert_is_fully_defined()
      partition_sizes.append(shape[0].value)
    num_params = sum(partition_sizes)  # Total number of parameters.

    # Assert the size of each partition.
    for p in range(num_partitions):
      # Sizes must follow the layout described in the docstring: the first
      # `num_params % num_partitions` partitions hold one extra element
      # (e.g. `[3, 3, 2, 2]` for 10 parameters in 4 partitions).
      expected_size = (num_params - p - 1) // num_partitions + 1
      if partition_sizes[p] != expected_size:
        raise ValueError("Tensor %d in params has size %d, expected %d." %
                         (p, partition_sizes[p], expected_size))

    # With two values v1 and v2 and 3 dimensions, we will cross
    # [[0, 1, 2], [0, 1, 2]] with [[v1], [v2]].
    tensors_to_cross = [sampled_candidates, values]
    # Hashing into `num_params` buckets turns every (dimension index, value)
    # pair into an id into the flattened parameters.
    ids = sparse_feature_cross_op.sparse_feature_cross(
        tensors_to_cross, hashed_output=True, num_buckets=num_params,
        hash_key=hash_key)
    ids = sparse_ops.sparse_tensor_to_dense(ids)

    # No need to validate the indices since we have checked the params
    # dimensions and we know the largest id.
    # NOTE(review): no `validate_indices` argument is actually passed to
    # `embedding_lookup` here; confirm that relying on its default is intended.
    result = embedding_ops.embedding_lookup(
        params, ids, partition_strategy="div")

    # Restore the original leading dimensions of `values` and append the
    # embedding dimension.
    return array_ops.reshape(result,
                             array_ops.concat([values_shape, [dimension]], 0))
Exemple #34
0
def hashed_embedding_lookup(params, values, dimension, name=None):
    """Looks up embeddings using parameter hashing for each value in `values`.

    The i-th embedding component of a value v in `values` is found by
    retrieving the weight whose index is a fingerprint of the pair (v,i).
    The concept is explored as "feature hashing" for model compression in this
    paper: http://arxiv.org/pdf/1504.04788.pdf

    Feature hashing has the pleasant effect of allowing us to compute an
    embedding without needing a pre-determined vocabulary, relieving some
    amount of process complexity. It also allows for us to maintain embeddings
    for possibly trillions of features with a fixed amount of memory.

    Note that this is superior to out-of-vocabulary shared "hash buckets" in
    that the embedding is extremely likely to be unique for each token as
    opposed to being shared across probably-colliding tokens. The price is that
    we must compute a hash once for each scalar in the token's embedding as
    opposed to once per token.

    If `params` is a list, it represents a partition of the embedding
    parameters. Each tensor in the list should have the same length, except for
    the first ones which may have an additional element. For instance 10
    parameters can be partitioned in 4 tensors with length `[3, 3, 2, 2]`.

    Args:
      params: A `Tensor`, `list` of `Tensors`, or `PartitionedVariable`.
        Each tensor must be of rank 1 with fully-defined shape.
      values: `Tensor` of values to be embedded.
      dimension: Embedding dimension
      name: An optional name for this op.

    Returns:
      A tensor with shape [d0, ..., dn, dimension]
        with shape(values) = [d0, ..., dn]

    Raises:
      ValueError: if dimension is not positive or the partition size is
        invalid.
    """
    # Normalize `params` to a plain list of tensors.
    if isinstance(params, variables.PartitionedVariable):
        params = list(params)
    if not isinstance(params, list):
        params = [params]

    with ops.name_scope(name, "hashed_embedding_lookup",
                        params + [dimension, values]):
        if dimension <= 0:
            raise ValueError("Dimension should be >0 not %d" % dimension)

        num_partitions = len(params)
        partition_sizes = []
        for p in range(num_partitions):
            shape = params[p].get_shape()
            shape.assert_has_rank(1)
            shape.assert_is_fully_defined()
            partition_sizes.append(shape[0].value)
        num_params = sum(partition_sizes)  # Total number of parameters.

        # Assert the size of each partition.
        for p in range(num_partitions):
            # Sizes must follow the layout described in the docstring: the
            # first `num_params % num_partitions` partitions hold one extra
            # element (e.g. `[3, 3, 2, 2]` for 10 parameters in 4 partitions).
            expected_size = (num_params - p - 1) // num_partitions + 1
            if partition_sizes[p] != expected_size:
                raise ValueError(
                    "Tensor %d in params has size %d, expected %d." %
                    (p, partition_sizes[p], expected_size))

        # Flatten the values
        values_shape = array_ops.shape(values)
        values = array_ops.reshape(values, [-1, 1])

        # With two values v1 and v2 and 3 dimensions, we will cross
        # [[0, 1, 2], [0, 1, 2]] with [[v1], [v2]].
        tensors_to_cross = [
            array_ops.tile(
                array_ops.expand_dims(math_ops.range(0, dimension), 0),
                array_ops.shape(values)), values
        ]
        # Hashing into `num_params` buckets turns every (dimension index,
        # value) pair into an id into the flattened parameters.
        ids = sparse_feature_cross_op.sparse_feature_cross(
            tensors_to_cross, hashed_output=True, num_buckets=num_params)
        ids = sparse_ops.sparse_tensor_to_dense(ids)

        # No need to validate the indices since we have checked the params
        # dimensions and we know the largest id.
        result = embedding_ops.embedding_lookup(params,
                                                ids,
                                                partition_strategy="div",
                                                validate_indices=False)

        # Restore the original leading dimensions of `values` and append the
        # embedding dimension.
        # NOTE(review): `concat` is called with the axis as its first
        # argument; confirm this matches the `array_ops.concat` signature of
        # the TensorFlow version this file targets.
        return array_ops.reshape(
            result, array_ops.concat(0, [values_shape, [dimension]]))
Exemple #35
0
def hashed_embedding_lookup(params, values, dimension, name=None):
  """Looks up embeddings using parameter hashing for each value in `values`.

  The i-th embedding component of a value v in `values` is found by retrieving
  the weight whose index is a fingerprint of the pair (v,i).
  The concept is explored as "feature hashing" for model compression in this
  paper: http://arxiv.org/pdf/1504.04788.pdf

  Feature hashing has the pleasant effect of allowing us to compute an embedding
  without needing a pre-determined vocabulary, relieving some amount of process
  complexity. It also allows for us to maintain embeddings for possibly
  trillions of features with a fixed amount of memory.

  Note that this is superior to out-of-vocabulary shared "hash buckets" in that
  the embedding is extremely likely to be unique for each token as opposed to
  being shared across probably-colliding tokens. The price is that we must
  compute a hash once for each scalar in the token's embedding as opposed to
  once per token.

  If `params` is a list (or tuple), it represents a partition of the embedding
  parameters. Each tensor in the list should have the same length, except for
  the first ones which may have an additional element. For instance 10
  parameters can be partitioned in 4 tensors with length `[3, 3, 2, 2]`.

  Args:
    params: A `Tensor`, or a `list`/`tuple` of `Tensors`.
      Each tensor must be of rank 1 with fully-defined shape.
    values: `Tensor` of values to be embedded.
    dimension: Embedding dimension
    name: An optional name for this op.

  Returns:
    A tensor with shape [d0, ..., dn, dimension]
      with shape(values) = [d0, ..., dn]

  Raises:
    ValueError: if dimension is not positive, if `params` holds no parameters
      at all, or if the partition size is invalid.
  """
  # A lone tensor is treated as a single partition. Tuples are normalized to a
  # list because the name scope values below are built by list concatenation.
  if isinstance(params, (list, tuple)):
    params = list(params)
  else:
    params = [params]

  with ops.name_scope(name, "hashed_embedding_lookup",
                      params + [dimension, values]):
    if dimension <= 0:
      raise ValueError("Dimension should be >0 not %d" % dimension)

    num_partitions = len(params)
    partition_sizes = []
    for param in params:
      shape = param.get_shape()
      shape.assert_has_rank(1)
      shape.assert_is_fully_defined()
      partition_sizes.append(shape[0].value)
    num_params = sum(partition_sizes)  # Total number of parameters.

    # With zero parameters the hash ids could not be bucketed into a valid
    # range, and the unvalidated lookup below would index into nothing.
    if num_params == 0:
      raise ValueError(
          "params must contain at least one parameter, but the total size of "
          "its %d partition(s) is 0." % num_partitions)

    # Assert the size of each partition: the parameters must be split as
    # evenly as possible, with the leading partitions taking the remainder
    # (this is the layout the "div" strategy used below expects).
    for p, actual_size in enumerate(partition_sizes):
      expected_size = (num_params - p - 1) // num_partitions + 1
      if actual_size != expected_size:
        raise ValueError("Tensor %d in params has size %d, expected %d." %
                         (p, actual_size, expected_size))

    # Flatten the values into a column, remembering the original shape so the
    # result can be reshaped back at the end.
    values_shape = array_ops.shape(values)
    values = array_ops.reshape(values, [-1, 1])

    # With two values v1 and v2 and 3 dimensions, we will cross
    # [[0, 1, 2], [0, 1, 2]] with [[v1], [v2]].
    component_ids = array_ops.tile(
        array_ops.expand_dims(math_ops.range(0, dimension), 0),
        array_ops.shape(values))
    tensors_to_cross = [component_ids, values]
    # num_buckets=num_params is what keeps every hashed id a valid index into
    # the concatenated parameters.
    ids = sparse_feature_cross_op.sparse_feature_cross(
        tensors_to_cross, hashed_output=True, num_buckets=num_params)
    ids = sparse_ops.sparse_tensor_to_dense(ids)

    # No need to validate the indices since we have checked the params
    # dimensions and we know the largest id.
    result = embedding_ops.embedding_lookup(
        params, ids, partition_strategy="div", validate_indices=False)

    # Restore the leading dimensions of `values` and append the embedding axis.
    output_shape = array_ops.concat(0, [values_shape, [dimension]])
    return array_ops.reshape(result, output_shape)