def test_shared_embedding_column_with_non_sequence_categorical(self):
    """Checks that sequence_input_layer rejects non-sequence shared columns.

    The shared embedding columns are built on top of plain identity
    categorical columns (not the sequence variant), so constructing the
    sequence input layer is expected to raise a ValueError.
    """
    vocabulary_size = 3

    # Both features carry the same ids:
    #   example 0, ids [2]
    #   example 1, ids [0, 1]
    sparse_input_a = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    sparse_input_b = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    features = {
        'aaa': sparse_input_a,
        'bbb': sparse_input_b
    }

    # Non-sequence categorical columns, in the order 'aaa' then 'bbb'.
    non_sequence_columns = [
        fc.categorical_column_with_identity(
            key=feature_key, num_buckets=vocabulary_size)
        for feature_key in ('aaa', 'bbb')
    ]
    shared_embedding_columns = fc.shared_embedding_columns(
        non_sequence_columns, dimension=2)

    expected_error = (
        r'In embedding_column: aaa_shared_embedding\. categorical_column must '
        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.')
    with self.assertRaisesRegexp(ValueError, expected_error):
        _, _ = sfc.sequence_input_layer(
            features=features,
            feature_columns=shared_embedding_columns)
def test_sequence_length_with_empty_rows(self):
    """Checks the sequence lengths when some examples carry no ids at all."""
    vocabulary_size = 3

    # Feature 'aaa':
    #   example 0, ids []
    #   example 1, ids [2]
    #   example 2, ids [0, 1]
    #   example 3, ids []
    #   example 4, ids [1]
    #   example 5, ids []
    ids_a = sparse_tensor.SparseTensorValue(
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    want_lengths_a = [0, 1, 2, 0, 1, 0]

    # Feature 'bbb':
    #   example 0, ids [2]
    #   example 1, ids []
    #   example 2, ids []
    #   example 3, ids []
    #   example 4, ids [1]
    #   example 5, ids [0, 1]
    ids_b = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
        values=(2, 1, 0, 1),
        dense_shape=(6, 2))
    want_lengths_b = [1, 0, 0, 0, 1, 2]

    column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_columns = fc.shared_embedding_columns(
        [column_a, column_b], dimension=2)

    # Element 1 of the returned pair is the sequence length tensor.
    builder_a = _LazyBuilder({'aaa': ids_a})
    lengths_a = shared_columns[0]._get_sequence_dense_tensor(builder_a)[1]
    builder_b = _LazyBuilder({'bbb': ids_b})
    lengths_b = shared_columns[1]._get_sequence_dense_tensor(builder_b)[1]

    with monitored_session.MonitoredSession() as sess:
        for want, got in ((want_lengths_a, lengths_a),
                          (want_lengths_b, lengths_b)):
            self.assertAllEqual(want, got.eval(session=sess))
def test_sequence_length(self):
    """Checks the sequence lengths of two shared columns and their int64 dtype."""
    vocabulary_size = 3

    # Feature 'aaa':
    #   example 0, ids [2]
    #   example 1, ids [0, 1]
    ids_a = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    want_lengths_a = [1, 2]

    # Feature 'bbb':
    #   example 0, ids [0, 2]
    #   example 1, ids [1]
    ids_b = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0, 2, 1),
        dense_shape=(2, 2))
    want_lengths_b = [2, 1]

    column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_columns = fc.shared_embedding_columns(
        [column_a, column_b], dimension=2)

    # Element 1 of the returned pair is the sequence length tensor.
    length_tensor_a = shared_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': ids_a}))[1]
    length_tensor_b = shared_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({'bbb': ids_b}))[1]

    with monitored_session.MonitoredSession() as sess:
        got_lengths_a = sess.run(length_tensor_a)
        self.assertAllEqual(want_lengths_a, got_lengths_a)
        self.assertEqual(np.int64, got_lengths_a.dtype)

        got_lengths_b = sess.run(length_tensor_b)
        self.assertAllEqual(want_lengths_b, got_lengths_b)
        self.assertEqual(np.int64, got_lengths_b.dtype)
def test_reuse():
    """Builds one set of feature columns under two variable scopes and prints them.

    The same column list (two plain embedding columns plus a pair of shared
    embedding columns) is fed through `tf.feature_column.input_layer` once
    under scope "a" and once under scope "b". The trainable variables
    collected for each scope are printed, followed by both evaluated layers.
    """
    data = {
        'gender': [['M'], ['G'], ['M'], ['M']],
        'user': [['A'], ['B'], ['C'], ['C']],
        'pos': [['a'], ['d'], ['f'], ['c']],
        'neg': [['c'], ['e'], ['d'], ['a']]
    }
    user_vocabulary = ['A', 'B', 'C', 'D']
    item_vocabulary = ['a', 'b', 'c', 'd', 'e', 'f']

    def _string_vocab_column(key, vocabulary):
        # Every categorical column in this demo is a string vocabulary column.
        return feature_column.categorical_column_with_vocabulary_list(
            key, vocabulary, dtype=tf.string)

    gender_col = _string_vocab_column('gender', ['M', 'G'])
    user_col = _string_vocab_column('user', user_vocabulary)
    pos_item_col = _string_vocab_column('pos', item_vocabulary)
    neg_item_col = _string_vocab_column('neg', item_vocabulary)

    gender_embedding = feature_column.embedding_column(gender_col, 2)
    user_embedding = feature_column.embedding_column(user_col, 2)
    # 'pos' and 'neg' use the same item vocabulary and share one embedding.
    pos_embedding, neg_embedding = feature_column.shared_embedding_columns(
        [pos_item_col, neg_item_col], 3)
    columns = [gender_embedding, user_embedding, pos_embedding, neg_embedding]

    layers = []
    for scope_name in ("a", "b"):
        with tf.variable_scope(scope_name) as scope:
            layers.append(tf.feature_column.input_layer(data, columns))
            print(tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name))
    first_layer, second_layer = layers

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run(first_layer))
        print('------------------')
        print(sess.run(second_layer))
def __init__(self, params, is_trainning=False, dtype=tf.float32,
             name="word_embedding"):
    """Creates shared embedding columns per vocabulary file and a title layer.

    Args:
        params: mapping read for the keys "voc_file_root",
            "item_feature_list", "item_feature_voc_file",
            "item_title_voc_file" and "embedding_size".
        is_trainning: forwarded to the parent initializer.
        dtype: forwarded to the parent initializer.
        name: forwarded to the parent initializer.
    """
    super(EmbeddingLayer, self).__init__(is_trainning, name, dtype)
    self.params = params
    self.columns = {}
    voc_root = params["voc_file_root"]

    # Group (feature name, categorical column) pairs by vocabulary file path
    # so that features backed by the same file end up sharing an embedding.
    columns_by_voc = {}
    feature_specs = zip(self.params["item_feature_list"],
                        self.params["item_feature_voc_file"])
    for feature_name, voc_name in feature_specs:
        voc_path = os.path.join(voc_root, voc_name)
        voc_size = get_vocab_file_size(voc_path)
        column = categorical_column_with_vocabulary_file(
            key=feature_name,
            vocabulary_size=voc_size,
            vocabulary_file=voc_path,
            num_oov_buckets=1)
        columns_by_voc.setdefault(voc_path, []).append((feature_name, column))

    # One shared_embedding_columns call per vocabulary file; the results are
    # paired back with feature names positionally.
    for grouped in six.itervalues(columns_by_voc):
        feature_names = [feature for feature, _ in grouped]
        shared_cols = shared_embedding_columns(
            [column for _, column in grouped], dimension=10)
        self.columns.update(zip(feature_names, shared_cols))

    title_voc_path = os.path.join(voc_root, self.params["item_title_voc_file"])
    title_voc_size = get_vocab_file_size(title_voc_path)
    self.common_embedding_layer = CommonWordEmbedingLayer(
        title_voc_size,
        title_voc_path,
        embedding_size=self.params["embedding_size"],
        name="common_embedding")
def test_get_sequence_dense_tensor(self):
    """Checks embedding lookups of two sequence columns sharing one variable.

    Both columns are expected to read from a single 'embedding_weights:0'
    variable, with all-zero rows wherever an example has no id at a position.
    """
    vocabulary_size = 3
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _fixed_initializer(shape, dtype, partition_info):
        # Verifies how the embedding variable is requested, then returns the
        # fixed table above.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    # Feature 'aaa':
    #   example 0, ids [2]
    #   example 1, ids [0, 1]
    #   example 2, ids []
    #   example 3, ids [1]
    ids_a = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))
    # Feature 'bbb':
    #   example 0, ids [1]
    #   example 1, ids [0, 2]
    #   example 2, ids [0]
    #   example 3, ids []
    ids_b = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1), (2, 0)),
        values=(1, 0, 2, 0),
        dense_shape=(4, 2))

    # Embedding rows by id, plus the all-zero row used for missing positions.
    row_0, row_1, row_2 = [1., 2.], [3., 5.], [7., 11.]
    empty = [0., 0.]
    want_lookups_a = [
        [row_2, empty],  # example 0, ids [2]
        [row_0, row_1],  # example 1, ids [0, 1]
        [empty, empty],  # example 2, ids []
        [row_1, empty],  # example 3, ids [1]
    ]
    want_lookups_b = [
        [row_1, empty],  # example 0, ids [1]
        [row_0, row_2],  # example 1, ids [0, 2]
        [row_0, empty],  # example 2, ids [0]
        [empty, empty],  # example 3, ids []
    ]

    column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_columns = fc.shared_embedding_columns(
        [column_a, column_b],
        dimension=embedding_dimension,
        initializer=_fixed_initializer)

    # Element 0 of the returned pair is the dense embedding lookup.
    lookup_a = shared_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': ids_a}))[0]
    lookup_b = shared_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({'bbb': ids_b}))[0]

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    variable_names = tuple([v.name for v in global_vars])
    self.assertItemsEqual(('embedding_weights:0',), variable_names)

    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(
            embedding_values, global_vars[0].eval(session=sess))
        self.assertAllEqual(want_lookups_a, lookup_a.eval(session=sess))
        self.assertAllEqual(want_lookups_b, lookup_b.eval(session=sess))
def test_shared_embedding_column(self):
    """Checks sequence_input_layer output for a pair of shared embedding columns.

    The columns are handed over as [bbb, aaa], while the expected layer holds
    the 'aaa' embedding ahead of the 'bbb' one for every step.
    """
    vocabulary_size = 3
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 4.),  # id 1
        (5., 6.)  # id 2
    )

    # Feature 'aaa':
    #   example 0, ids [2]
    #   example 1, ids [0, 1]
    ids_a = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    # Feature 'bbb':
    #   example 0, ids [1]
    #   example 1, ids [2, 0]
    ids_b = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(1, 2, 0),
        dense_shape=(2, 2))

    def _fixed_initializer(shape, dtype, partition_info):
        # Verifies how the embedding variable is requested, then returns the
        # fixed table above.
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(dtypes.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values

    want_input_layer = [
        # example 0, ids_a [2], ids_b [1]
        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
        # example 1, ids_a [0, 1], ids_b [2, 0]
        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
    ]
    want_sequence_length = [1, 2]

    column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)

    # Test that columns are reordered alphabetically: 'bbb' is passed first.
    shared_columns = fc.shared_embedding_columns(
        [column_b, column_a],
        dimension=embedding_dimension,
        initializer=_fixed_initializer)

    features = {
        'aaa': ids_a,
        'bbb': ids_b,
    }
    input_layer, sequence_length = sfc.sequence_input_layer(
        features=features, feature_columns=shared_columns)

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    variable_names = tuple([v.name for v in global_vars])
    self.assertItemsEqual(
        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
        variable_names)

    with monitored_session.MonitoredSession() as sess:
        self.assertAllEqual(
            embedding_values, global_vars[0].eval(session=sess))
        self.assertAllEqual(want_input_layer, input_layer.eval(session=sess))
        self.assertAllEqual(
            want_sequence_length, sequence_length.eval(session=sess))