def test_shared_embedding_column_with_non_sequence_categorical(self):
    """Tests that error is raised for non-sequence shared embedding column."""
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))

    categorical_column_a = fc.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    categorical_column_b = fc.categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)

    with self.assertRaisesRegexp(
        ValueError,
        r'In embedding_column: aaa_shared_embedding\. categorical_column must '
        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.'):
      _, _ = sfc.sequence_input_layer(
          features={
              'aaa': sparse_input_a,
              'bbb': sparse_input_b
          },
          feature_columns=shared_embedding_columns)
  def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)

    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids []
        # example 2, ids []
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids [0, 1]
        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
        values=(2, 1, 0, 1),
        dense_shape=(6, 2))
    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)

    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)

    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[1]
    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[1]

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length_a, sequence_length_a.eval(session=sess))
      self.assertAllEqual(
          expected_sequence_length_b, sequence_length_b.eval(session=sess))
  def test_sequence_length(self):
    vocabulary_size = 3

    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    expected_sequence_length_a = [1, 2]
    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)

    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [0, 2]
        # example 1, ids [1]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0, 2, 1),
        dense_shape=(2, 2))
    expected_sequence_length_b = [2, 1]
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)

    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[1]
    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[1]

    with monitored_session.MonitoredSession() as sess:
      sequence_length_a = sess.run(sequence_length_a)
      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
      self.assertEqual(np.int64, sequence_length_a.dtype)
      sequence_length_b = sess.run(sequence_length_b)
      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
      self.assertEqual(np.int64, sequence_length_b.dtype)
Example #4
0
def test_reuse():
    data = {
        'gender': [['M'], ['G'], ['M'], ['M']],
        'user': [['A'], ['B'], ['C'], ['C']],
        'pos': [['a'], ['d'], ['f'], ['c']],
        'neg': [['c'], ['e'], ['d'], ['a']]
    }
    user_v_list = ['A', 'B', 'C', 'D']
    item_v_list = ['a', 'b', 'c', 'd', 'e', 'f']

    gender_col = feature_column.categorical_column_with_vocabulary_list(
        'gender', ['M', "G"], dtype=tf.string)
    user_col = feature_column.categorical_column_with_vocabulary_list(
        'user', user_v_list, dtype=tf.string)
    pos_item_col = feature_column.categorical_column_with_vocabulary_list(
        'pos', item_v_list, dtype=tf.string)
    neg_item_col = feature_column.categorical_column_with_vocabulary_list(
        'neg', item_v_list, dtype=tf.string)

    gender_embedding = feature_column.embedding_column(gender_col, 2)
    user_embedding = feature_column.embedding_column(user_col, 2)
    pos_embedding, neg_embedding = feature_column.shared_embedding_columns(
        [pos_item_col, neg_item_col], 3)
    columns = [gender_embedding, user_embedding, pos_embedding, neg_embedding]

    with tf.variable_scope("a") as scope:
        aa = scope.name
        ret = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=aa))

    with tf.variable_scope("b") as scope:
        bb = scope.name
        ret1 = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=bb))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run(ret))
        print('------------------')
        print(sess.run(ret1))
Example #5
0
    def __init__(self,
                 params,
                 is_trainning=False,
                 dtype=tf.float32,
                 name="word_embedding"):
        super(EmbeddingLayer, self).__init__(is_trainning, name, dtype)
        self.params = params
        self.columns = {}
        col_dic = {}
        root = params["voc_file_root"]
        for feature_name, voc_file in zip(
                self.params["item_feature_list"],
                self.params["item_feature_voc_file"]):
            voc_file = os.path.join(root, voc_file)
            category_col = categorical_column_with_vocabulary_file(
                key=feature_name,
                vocabulary_size=get_vocab_file_size(voc_file),
                vocabulary_file=voc_file,
                num_oov_buckets=1)
            # self.columns[feature_name] = embedding_column(category_col,10)
            if col_dic.get(voc_file):
                col_dic.get(voc_file).append((feature_name, category_col))
            else:
                col_dic[voc_file] = [(feature_name, category_col)]
        for _, value in six.iteritems(col_dic):
            cols = []
            for name, col in value:
                cols.append(col)
            shared_cols = shared_embedding_columns(cols, dimension=10)
            for (name, _), share_col in zip(value, shared_cols):
                self.columns[name] = share_col

        voc_file = os.path.join(root, self.params["item_title_voc_file"])
        voc_file_size = get_vocab_file_size(voc_file)
        self.common_embedding_layer = CommonWordEmbedingLayer(
            voc_file_size,
            voc_file,
            embedding_size=self.params["embedding_size"],
            name="common_embedding")
  def test_get_sequence_dense_tensor(self):
    vocabulary_size = 3
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))
    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [1]
        # example 1, ids [0, 2]
        # example 2, ids [0]
        # example 3, ids []
        indices=((0, 0), (1, 0), (1, 1), (2, 0)),
        values=(1, 0, 2, 0),
        dense_shape=(4, 2))

    expected_lookups_a = [
        # example 0, ids [2]
        [[7., 11.], [0., 0.]],
        # example 1, ids [0, 1]
        [[1., 2.], [3., 5.]],
        # example 2, ids []
        [[0., 0.], [0., 0.]],
        # example 3, ids [1]
        [[3., 5.], [0., 0.]],
    ]

    expected_lookups_b = [
        # example 0, ids [1]
        [[3., 5.], [0., 0.]],
        # example 1, ids [0, 2]
        [[1., 2.], [7., 11.]],
        # example 2, ids [0]
        [[1., 2.], [0., 0.]],
        # example 3, ids []
        [[0., 0.], [0., 0.]],
    ]

    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b],
        dimension=embedding_dimension,
        initializer=_initializer)

    embedding_lookup_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[0]
    embedding_lookup_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[0]

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0',),
                          tuple([v.name for v in global_vars]))
    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
      self.assertAllEqual(
          expected_lookups_a, embedding_lookup_a.eval(session=sess))
      self.assertAllEqual(
          expected_lookups_b, embedding_lookup_b.eval(session=sess))
  def test_shared_embedding_column(self):
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [1]
        # example 1, ids [2, 0]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(1, 2, 0),
        dense_shape=(2, 2))

    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 4.),  # id 1
        (5., 6.)  # id 2
    )

    def _get_initializer(embedding_dimension, embedding_values):

      def _initializer(shape, dtype, partition_info):
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

      return _initializer

    expected_input_layer = [
        # example 0, ids_a [2], ids_b [1]
        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
        # example 1, ids_a [0, 1], ids_b [2, 0]
        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
    ]
    expected_sequence_length = [1, 2]

    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    # Test that columns are reordered alphabetically.
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_b, categorical_column_a],
        dimension=embedding_dimension,
        initializer=_get_initializer(embedding_dimension, embedding_values))

    input_layer, sequence_length = sfc.sequence_input_layer(
        features={
            'aaa': sparse_input_a,
            'bbb': sparse_input_b,
        },
        feature_columns=shared_embedding_columns)

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(
        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
        tuple([v.name for v in global_vars]))
    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))