# -*- coding: utf-8 -*-
# TF 1.x feature_column snippets. Common imports for the functions below:
from functools import partial

import tensorflow as tf
from tensorflow import feature_column
from tensorflow import feature_column as fc
from tensorflow.python.estimator.canned import optimizers
from tensorflow.python.feature_column.feature_column import _LazyBuilder


# NOTE: get_feature_columns, EMBEDDING_NUM, args, and the cus_nn helper come
# from surrounding code that is not shown here.
def build_model(self, is_training=True):
    columns = get_feature_columns()
    item_columns = fc.input_layer(self.other_features, columns['item'])
    user_columns = fc.input_layer(self.other_features, columns['user'])
    # Look up the user-cluster features that share this user's ID.
    self.user_findid = self.table1.lookup(self.other_features['FEA_CtxUid'])
    self.user_embedding = tf.gather(self.user_feature, self.user_findid)
    # Split the user embedding into its four relation parts and apply attention.
    user_split = tf.split(self.user_embedding, 4, 1)
    rela = tf.stack(user_split, 1)
    rela_conv = tf.layers.conv1d(rela, 1, 1)
    coef = tf.expand_dims(user_columns, 1)
    coef = tf.layers.conv1d(coef, 1, 1)
    out = tf.multiply(coef, rela_conv)
    coefs = tf.nn.softmax(tf.nn.tanh(out), 1)
    res = tf.multiply(coefs, rela)
    res = tf.reduce_sum(res, 1)
    item = self.cus_nn(item_columns, None,
                       [EMBEDDING_NUM * 2, EMBEDDING_NUM], is_training)
    norm_item = tf.sqrt(tf.reduce_sum(tf.square(item), 1, True))
    item_emb = tf.truediv(item, norm_item)
    # for column in sorted(params['user_columns'], key=lambda x: x.name):
    #     print(column.name)
    # Produce the item embeddings.
    if args.mode == 'sample':
        self.prediction = {
            'vid': self.other_features['FEA_SrcItemId'],
            'item': item_emb,
        }
    else:
        # (This result is overwritten by the dense layer below.)
        user = self.cus_nn(user_columns, None,
                           [EMBEDDING_NUM * 4, EMBEDDING_NUM * 2, EMBEDDING_NUM],
                           is_training)
        # Concatenate the attention embedding and feed it into a dense layer.
        user_columns_out = tf.concat([user_columns, res], axis=1)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)
        user = tf.layers.dense(
            inputs=user_columns_out,
            units=EMBEDDING_NUM,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            activation=partial(tf.nn.leaky_relu, alpha=0.2),
            use_bias=True,
            kernel_regularizer=regularizer,
        )
        # Residual connection (disabled):
        # user = user + net
        # user = tf.concat([res, user], axis=1)
        # user = self.cus_nn(user, None, [EMBEDDING_NUM], is_training)
        norm_user = tf.sqrt(tf.reduce_sum(tf.square(user), 1, True))
        user_emb = tf.truediv(user, norm_user)
        self.cos_sim_raw = tf.reduce_sum(
            tf.multiply(user_emb, item_emb), 1, True)
        self.prob = tf.nn.sigmoid(self.cos_sim_raw)
def test_bucketized_column():
    sample = {
        'price': [[5.], [16], [25], [36]],
        'time': [[2.], [6], [8], [15]]
    }

    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(sample, [bucket_price])

    time_column = feature_column.numeric_column('time')
    bucket_time = feature_column.bucketized_column(time_column, [5, 10, 12])
    time_bucket_tensor = feature_column.input_layer(sample, [bucket_time])

    with tf.Session() as session:
        print(session.run([price_bucket_tensor, time_bucket_tensor]))
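# Expected output (a sketch): boundaries [b0, b1, ...] create len+1 buckets,
# so [10, 20, 30, 40] yields 5 one-hot slots and [5, 10, 12] yields 4:
#   price: 5 -> bucket 0, 16 -> 1, 25 -> 2, 36 -> 3
#   time:  2 -> bucket 0,  6 -> 1,  8 -> 1, 15 -> 3
# [array([[1., 0., 0., 0., 0.],
#         [0., 1., 0., 0., 0.],
#         [0., 0., 1., 0., 0.],
#         [0., 0., 0., 1., 0.]], dtype=float32),
#  array([[1., 0., 0., 0.],
#         [0., 1., 0., 0.],
#         [0., 1., 0., 0.],
#         [0., 0., 0., 1.]], dtype=float32)]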
def test_categorical_column_with_hash_bucket():
    # Source data: 4 sample rows, shape=[4, 1].
    color_data = {'color': [[2], [5], [-1], [0]]}
    builder = _LazyBuilder(color_data)

    # Categorical column.
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    # Sparse-tensor representation.
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # indicator_column converts the sparse column into a dense one-hot
    # (strictly, multi-hot) representation.
    color_column_identy = feature_column.indicator_column(color_column)
    # input_layer wires the raw data to the declared columns and produces a
    # new dense tensor.
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identy])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def test_crossed_column():
    """Test crossed_column."""
    # Source data.
    features = {
        'price': [['A'], ['B'], ['C']],  # 0, 1, 2
        'color': [['R'], ['G'], ['B']]   # 0, 1, 2
    }
    # Categorical columns.
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    # crossed_column yields a sparse representation.
    p_x_c = feature_column.crossed_column([price, color], 16)
    # Dense representation.
    p_x_c_identy = feature_column.indicator_column(p_x_c)
    # Wire the crossed column to the source data.
    p_x_c_identy_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identy])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identy_dense_tensor]))
def build_mode_norm_test(features, mode, params):
    # Build the hidden layers, sized according to the 'hidden_units' param.
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    fea_net = fc.input_layer(features, params['feature_columns'])
    fea_net = tf.layers.batch_normalization(fea_net, training=is_training)
    # x1 = tf.layers.dense(fea_net, units=256, activation=None, use_bias=False)
    # hidden1 = tf.nn.relu(tf.layers.batch_normalization(x1, training=is_training), name='hidden1')
    # x2 = tf.layers.dense(hidden1, units=128, activation=None, use_bias=False)
    # hidden2 = tf.nn.relu(tf.layers.batch_normalization(x2, training=is_training), name='hidden2')
    # net = tf.layers.dense(hidden2, units=64, activation=tf.tanh, name='user_vector_layer')
    hidden1 = tf.layers.dense(fea_net, units=128, activation=tf.nn.relu,
                              name='hidden1')
    # hidden2 = tf.layers.dense(hidden1, units=128, activation=tf.nn.relu, name='hidden2')
    net = tf.layers.dense(hidden1, units=64, activation=tf.nn.relu,
                          name='user_vector_layer')
    return net
def build_mode_norm(features, mode, params):
    # Build the hidden layers, sized according to the 'hidden_units' param.
    use_batch_norm = params['use_batch_norm']
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    net = fc.input_layer(features, params['feature_columns'])
    if use_batch_norm:
        net = tf.layers.batch_normalization(net, training=is_training)
    for units in params['hidden_units']:
        if use_batch_norm:
            x = tf.layers.dense(net, units=units, activation=None,
                                use_bias=False)
            net = tf.nn.relu(
                tf.layers.batch_normalization(x, training=is_training))
        else:
            net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
    if use_batch_norm:
        x = tf.layers.dense(net, units=params['last_hidden_units'],
                            activation=None, use_bias=False)
        net = tf.nn.elu(
            tf.layers.batch_normalization(x, training=is_training),
            name='user_vector_layer')
    else:
        net = tf.layers.dense(net, units=params['last_hidden_units'],
                              activation=tf.nn.relu,
                              name='user_vector_layer')
    return net
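# Note (a TF 1.x detail, not in the original snippet): when
# tf.layers.batch_normalization is used as above, its moving-average update
# ops are collected in tf.GraphKeys.UPDATE_OPS and must be run together with
# the train op in the surrounding model_fn, e.g.:
#
#   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
#   with tf.control_dependencies(update_ops):
#       train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())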
def test_embedding():
    tf.set_random_seed(1)
    # Source data: 4 sample rows.
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # To embed a column, first declare the raw column as a categorical
    # column; this is only a declaration, no data is attached yet.
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # Express the data source as a (sparse) tensor.
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # Build the embedding_column: the first argument is the categorical
    # column, the second is the embedding dimension.
    color_embedding_column = feature_column.embedding_column(
        color_column, 4, combiner='sum')
    # input_layer(data, columns) connects the data source to the
    # embedding_column and yields a dense tensor.
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding_column])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_categorical_column_with_hash_bucket():
    # 1. Input features
    color_data = {'color': [[2], [5], [-1], [0]]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    # Convert the categorical column to a dense column.
    color_column_identity = feature_column.indicator_column(color_column)
    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])
    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def my_model(features, labels, mode, params):
    net = fc.input_layer(features, params['feature_columns'])
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(
                net, params['dropout_rate'],
                training=(mode == tf.estimator.ModeKeys.TRAIN))
    my_head = tf.contrib.estimator.binary_classification_head(
        thresholds=[0.5])
    # Compute logits (1 per class).
    logits = tf.layers.dense(net, my_head.logits_dimension, activation=None,
                             name="my_model_output_logits")
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=params['learning_rate'])

    def _train_op_fn(loss):
        return optimizer.minimize(loss,
                                  global_step=tf.train.get_global_step())

    return my_head.create_estimator_spec(features=features, mode=mode,
                                         labels=labels, logits=logits,
                                         train_op_fn=_train_op_fn)
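# A minimal, hypothetical driver for my_model above; the feature column and
# param values here are illustrative assumptions, not from the original code.
def make_my_model_estimator():
    columns = [feature_column.numeric_column('age')]
    return tf.estimator.Estimator(
        model_fn=my_model,
        params={
            'feature_columns': columns,
            'hidden_units': [128, 64],
            'dropout_rate': 0.1,
            'learning_rate': 0.01,
        })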
def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # 2. Feature columns (Sparse)
    color_weight_categorical_column = \
        feature_column.weighted_categorical_column(color_column, 'weight')
    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (Dense)
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])

    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
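# Expected dense tensor (a sketch): indicator_column on a weighted categorical
# column sums the weights per vocabulary id; 'A' is OOV (id -1) and is dropped
# together with its weight:
# [array([[1., 0., 0.],
#         [0., 2., 0.],
#         [0., 0., 4.],
#         [0., 0., 0.]], dtype=float32)]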
def test_shared_embedding_column_with_hash_bucket():
    # 1. Input features
    color_data = {
        'range': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'id': [[2], [5], [-1], [0]]
    }
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'range', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # 2. Feature columns (Sparse)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'id', 7, dtype=tf.int32)
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)
    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))

    # 2. Feature columns (Dense): one embedding table shared by both columns.
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(color_dense_tensor))
def test_embedding():
    tf.set_random_seed(1)
    # 1. Input features
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        # session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    color_embedding = feature_column.embedding_column(color_column, 4,
                                                      combiner='sum')
    # 3. Feature tensor
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    with tf.Session() as session:
        # Embedding needs variables (weights) to do the embedding.
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_categorical_column_with_vocabulary_list():
    # 4 sample rows.
    color_data = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # Convert sparse to dense, i.e. one-hot (strictly, multi-hot).
    color_column_identy = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identy])
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
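# Expected dense output (a sketch): multi-hot counts over the vocab
# ['R', 'G', 'B']; the OOV token 'A' maps to id -1 and contributes nothing:
# [array([[2., 0., 0.],
#         [1., 1., 0.],
#         [0., 1., 1.],
#         [0., 0., 0.]], dtype=float32)]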
def test_multi_value_embedding():
    color_data = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'],
                  ['G', 'R'], ['R', 'R'], ['B', 'R']]
    }
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_embedding = feature_column.embedding_column(color_column, 7)
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_weighted_cate_column():
    # !!! id='' means "missing", and its weight must be 0; otherwise the id
    # and weight lists end up with different lengths and an error is raised.
    # !!! Also, weight must be float; passing ints raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column converts the sparse tensor into dense multi-hot (MHE)
    # format, shape=[batch_size, vocab_size], where each value is the sum of
    # all the weights with which that token occurs.
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # The lookup tables must be initialized, otherwise an error is raised.
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # The sparse id_tensor keeps the shape of the original input,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 0, -1, 0, 2, 1, -1, 1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # The sparse weight_tensor also keeps the shape of the original input,
        # [batch_size, max_tokens_per_example] = [2, 4]:
        # SparseTensorValue(indices=array(
        #     [[0, 0],
        #      [0, 1],
        #      [0, 2],
        #      [0, 3],
        #      [1, 0],
        #      [1, 2],
        #      [1, 3]]), values=array([ 1., 2., -3., 4., 5., 7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column converts the sparse tensor into a dense multi-hot
        # tensor of shape [batch_size, total_tokens_in_vocab], where each
        # value is the sum of all the weights with which that token occurs:
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
def test_shared_embedding_column_with_hash_bucket():
    # 4 sample rows.
    color_data = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]]
    }
    builder = _LazyBuilder(color_data)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))

    # Embed both sparse columns with a single shared embedding table.
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(color_dense_tensor))
def pratise():
    d = {'x': [[32], [16], [38], [98]]}
    cd = feature_column.numeric_column('x')
    bcd = feature_column.bucketized_column(cd, [10, 20, 40, 60])
    fcd = feature_column.input_layer(d, [bcd])
    with tf.Session() as sess:
        print(sess.run(fcd))
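# Expected output (a sketch): boundaries [10, 20, 40, 60] create 5 buckets;
# 32 -> bucket 2, 16 -> 1, 38 -> 2, 98 -> 4:
# [[0. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0.]
#  [0. 0. 1. 0. 0.]
#  [0. 0. 0. 0. 1.]]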
def test_bucketized_column():
    # 4 sample rows, shape=[4, 1].
    price = {'price': [[5.], [15.], [25.], [35.]]}
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
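# Expected output (a sketch): 5 -> bucket 0, 15 -> 1, 25 -> 2, 35 -> 3:
# [array([[1., 0., 0., 0., 0.],
#         [0., 1., 0., 0., 0.],
#         [0., 0., 1., 0., 0.],
#         [0., 0., 0., 1., 0.]], dtype=float32)]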
def test_weighted_categorical_feature_embedding():
    # 6 sample rows.
    color_data = {
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B'],
                  ['G', 'R'], ['G', 'B'], ['B', 'R']],
        'weight': [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5],
                   [0.3, 0.2], [0.4, 0.3], [0.4, 0.6]]
    }
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_embedding = feature_column.embedding_column(color_column, 7,
                                                      combiner="sum")
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')
    color_embedding_weighted = feature_column.embedding_column(
        color_weight_categorical_column, 7, combiner="sum")
    color_embedding_dense_tensor_2 = feature_column.input_layer(
        color_data, [color_embedding_weighted])

    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # _get_sparse_tensors returns a pair (id_tensor, weight_tensor).
    color_weighted_tensor = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print("color column weight:")
        print(color_column_tensor.weight_tensor)  # None: no weights attached
        print("color column weighted categorical, weight:")
        print(session.run([color_weighted_tensor.id_tensor]))
        print(session.run([color_weighted_tensor.weight_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
        print('embedding weighted categorical column')
        print(session.run([color_embedding_dense_tensor_2]))
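# Semantics note (a sketch, not from the original code): with combiner='sum',
# the weighted embedding of a row is sum_i weight_i * E[id_i]; e.g. the row
# ['G', 'R'] with weights [0.3, 0.2] yields 0.3 * E['G'] + 0.2 * E['R'].
# The unweighted column above is equivalent to giving every token weight 1.0.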
# NOTE: FLAGS comes from surrounding code that is not shown here.
def build_item_model(features, mode, params):
    with tf.variable_scope(
            "item_side",
            partitioner=tf.fixed_size_partitioner(
                len(FLAGS.ps_hosts.split(",")), axis=0)):
        item_uuid_embed = fc.input_layer(
            features, params["feature_configs"].all_columns["itemID"])
        item_dense = tf.nn.l2_normalize(item_uuid_embed)
    return item_dense
def build_mode(features, mode, params, columns):
    # import pdb; pdb.set_trace()
    net = fc.input_layer(features, columns)
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(
                net, params['dropout_rate'],
                training=(mode == tf.estimator.ModeKeys.TRAIN))
    return net
# NOTE: get_behavior_embedding and attention are helpers defined elsewhere
# (not shown here).
def dupn_model_fn(features, labels, mode, params):
    behvr_emb, property_emb, item_emb = get_behavior_embedding(params,
                                                               features)
    print("behvr_emb shape:", behvr_emb.shape)
    print("property_emb shape:", property_emb.shape)
    print("item_emb shape:", item_emb.shape)

    inputs = tf.concat([behvr_emb, property_emb], -1)
    print("lstm inputs shape:", inputs.shape)
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=params["num_units"])
    # initial_state = lstm_cell.zero_state(params["batch_size"], tf.float32)
    outputs, state = tf.nn.dynamic_rnn(lstm_cell, inputs, dtype=tf.float32)
    print("lstm output shape:", outputs.shape)

    masks = tf.cast(features["behaviorPids"] >= 0, tf.float32)
    user = fc.input_layer(features, params["user_feature_columns"])
    context = tf.concat([user, item_emb], -1)
    print("attention context shape:", context.shape)
    sequence = attention(outputs, context, params, masks)
    print("sequence embedding shape:", sequence.shape)

    other = fc.input_layer(features, params["other_feature_columns"])
    net = tf.concat([sequence, item_emb, other], -1)
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=int(units), activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(
                net, params['dropout_rate'],
                training=(mode == tf.estimator.ModeKeys.TRAIN))

    # Compute logits.
    logits = tf.layers.dense(net, 1, activation=None)
    optimizer = optimizers.get_optimizer_instance(params["optimizer"],
                                                  params["learning_rate"])
    my_head = tf.contrib.estimator.binary_classification_head(
        thresholds=[0.5])
    return my_head.create_estimator_spec(
        features=features, mode=mode, labels=labels, logits=logits,
        train_op_fn=lambda loss: optimizer.minimize(
            loss, global_step=tf.train.get_global_step()))
def build_mode(features, mode, params):
    net = fc.input_layer(features, params['feature_columns'])
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(
                net, params['dropout_rate'],
                training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Compute logits.
    logits = tf.layers.dense(net, 1, activation=None)
    return logits
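# A minimal, hypothetical call site for build_mode above; the column, feature
# values, and hyper-parameters are illustrative assumptions:
def example_build_mode():
    columns = [feature_column.numeric_column('price')]
    features = {'price': tf.constant([[1.0], [2.0]])}
    logits = build_mode(features, tf.estimator.ModeKeys.TRAIN,
                        params={'feature_columns': columns,
                                'hidden_units': [32, 16],
                                'dropout_rate': 0.1})
    return logits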
def practise():
    fx = {'x': [['a', 'a'], ['b', 'c'], ['c', 'e'], ['d', ''], ['e', 'f']]}
    # Renamed from the original local name `fc`, which shadowed the
    # feature_column module alias.
    hash_col = feature_column.categorical_column_with_hash_bucket('x', 5)
    fic = feature_column.indicator_column(hash_col)
    t2 = hash_col._get_sparse_tensors(_LazyBuilder(fx)).id_tensor
    tsor = feature_column.input_layer(fx, fic)
    with tf.Session() as sess:
        print(sess.run(t2))
        print(sess.run(tsor))
def test_identity_feature_column():
    sample = {'price': [[1], [2], [3], [0]]}
    # price_column = feature_column.numeric_column('price')
    price_column = feature_column.categorical_column_with_identity(
        key='price', num_buckets=4)
    indicator = feature_column.indicator_column(price_column)
    price_column_tensor = feature_column.input_layer(sample, [indicator])
    with tf.Session() as session:
        print(session.run([price_column_tensor]))
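# Expected output (a sketch): identity ids one-hot encoded over num_buckets=4:
# [array([[0., 1., 0., 0.],
#         [0., 0., 1., 0.],
#         [0., 0., 0., 1.],
#         [1., 0., 0., 0.]], dtype=float32)]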
# NOTE: make_columns_with_normalizer and the net helper are defined elsewhere
# (not shown here).
def inference(self, feats):
    # embedding_columns, order_columns, spacetime_columns, user_columns = make_columns()
    embedding_columns, order_columns, spacetime_columns, user_columns = \
        make_columns_with_normalizer()
    # TODO: split feats by column group.
    with tf.name_scope('embedding_columns'):
        embedding_tensor = fc.input_layer(feats, embedding_columns)
    with tf.name_scope('order_columns'):
        order_tensor = fc.input_layer(feats, order_columns)
    with tf.name_scope('spacetime_columns'):
        spacetime_tensor = fc.input_layer(feats, spacetime_columns)
    input_tensor = tf.concat(
        [embedding_tensor, order_tensor, spacetime_tensor],
        axis=1, name='input_concat')

    eta_d = tf.concat([net('eta_d_%d' % ix, input_tensor, self.layer_units)
                       for ix in range(self.d_k)], axis=1)
    eta_c = tf.concat([net('eta_c_%d' % ix, input_tensor, self.layer_units)
                       for ix in range(self.c_k)], axis=1)
    d_softmax_logits = net('logits_d', input_tensor, self.layer_units,
                           self.d_k)
    c_softmax_logits = net('logits_c', input_tensor, self.layer_units,
                           self.c_k)
    return eta_d, eta_c, d_softmax_logits, c_softmax_logits
def test_bucketized_column():
    # 1. Input features
    price = {'price': [[15.], [5.], [35.], [25.]]}
    # 2. Feature columns (Dense)
    price_column = feature_column.numeric_column('price')
    # 2. Feature columns (Dense): bucketized_column is both Dense and
    # Categorical.
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30])
    # 3. Feature tensor
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])

    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
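# Expected output (a sketch): boundaries [10, 20, 30] create 4 buckets;
# 15 -> bucket 1, 5 -> 0, 35 -> 3, 25 -> 2:
# [array([[0., 1., 0., 0.],
#         [1., 0., 0., 0.],
#         [0., 0., 0., 1.],
#         [0., 0., 1., 0.]], dtype=float32)]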
def test_cate_featcol_with_vocablist():
    # ================== prepare input
    # 1. Why the b prefix? Strings read in through an input_fn arrive as
    #    bytes, and the point here is to test whether a vocab list of plain
    #    str values still matches those byte strings.
    # 2. '' represents missing; feature_column treats it as "ignored in the
    #    sparse tensor, and 0 in the dense tensor".
    # 3. 'z' represents OOV; feature_column treats it as "-1 in the sparse
    #    tensor, and 0 in the dense tensor".
    # 4. Duplicates should be merged in the dense tensor by summing up their
    #    occurrences.
    x_values = {'x': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']]}
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_featcol = feature_column.categorical_column_with_vocabulary_list(
        'x', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # Even though a row contains duplicates, they are not merged here, so
    # there is no weight tensor at all. Duplicates merely repeat in the
    # id_tensor, which later makes embedding_lookup_sparse accumulate them.
    assert x_sparse_tensor.weight_tensor is None

    # indicator_column converts the sparse tensor into dense multi-hot (MHE)
    # format; note the duplicates in the first row, so the result is
    # multi-hot.
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # The lookup tables must be initialized, otherwise an error is raised.
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        print("************************* sparse tensor")
        # This shows that:
        # 1. b-prefixed input strings do match plain str entries in the vocab
        #    list.
        # 2. The second row keeps only three entries: '' was dropped from the
        #    sparse tensor.
        # 3. 'z' and 'd' are OOV and are mapped to -1 in the sparse tensor.
        # 4. The sparse tensor's dense_shape equals the original input shape.
        # [SparseTensorValue(indices=array([[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([0, -1, 0, 2, 1, -1, 1]), dense_shape=array([2, 4]))]
        print(sess.run([x_sparse_tensor.id_tensor]))

        print("************************* dense MHE tensor")
        # This shows that:
        # 1. In the dense representation, duplicate occurrences are summed
        #    (multi-hot encoding).
        # 2. Neither the original missing values (perhaps from padding) nor
        #    OOV tokens appear in the dense result.
        # 3. The dense tensor's shape is [batch_size, vocab_size].
        # [[2. 0. 1.]
        #  [0. 2. 0.]]
        print(sess.run(x_dense_tensor))
def build_model_net(features, mode, params):
    net = fc.input_layer(features, params['feature_columns'])
    # net = tf.layers.batch_normalization(net, training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Build the hidden layers, sized according to the 'hidden_units' param.
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(
                net, params['dropout_rate'],
                training=(mode == tf.estimator.ModeKeys.TRAIN))
    print("net node count", net.shape[-1].value)
    logits = tf.layers.dense(net, units=1)
    return logits
def emb():
    xb = {'x': [['a', 'b'], ['a', 'c'], ['b', 'c']]}  # unused sample data
    # Multi-valued features work too; this is a good way to represent, e.g.,
    # the tags of an article.
    x = {'x': [['a', 'b'], ['b', 'c'], ['c', ''], ['', '']]}
    fx = feature_column.categorical_column_with_vocabulary_list(
        'x', ['a', 'b', 'c', 'd'], dtype=tf.string, default_value=0)
    fex = feature_column.embedding_column(fx, 4, 'mean')
    t = feature_column.input_layer(x, [fex])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run(t))
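# A hypothetical driver (not in the original snippets): run a few of the
# deterministic examples above under TF 1.x.
if __name__ == '__main__':
    test_bucketized_column()
    test_categorical_column_with_vocabulary_list()
    test_identity_feature_column()
    emb()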