def cross_layer(x0, cross_layers, cross_op='better'):
    xl = x0
    if cross_op == 'better':
        cross_func = cross_op_better
    else:
        cross_func = cross_op_raw

    with tf.variable_scope('cross_layer'):
        feature_size = x0.get_shape().as_list()[-1]  # feature_size = n_feature * embedding_size
        for i in range(cross_layers):
            weight = tf.get_variable(shape=[feature_size],
                                     initializer=tf.truncated_normal_initializer(),
                                     name='cross_weight{}'.format(i))
            bias = tf.get_variable(shape=[feature_size],
                                   initializer=tf.truncated_normal_initializer(),
                                   name='cross_bias{}'.format(i))

            interaction = cross_func(xl, x0, weight, feature_size)
            xl = interaction + bias + xl  # add back previous layer -> (batch, feature_size)
            add_layer_summary('cross_{}'.format(i), xl)
    return xl
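# --- Hedged sketch (assumption, not the repo's confirmed code) --------------
# The snippets in this section assume the imports below, plus the two cross
# ops referenced by cross_layer above and the add_layer_summary helper used
# throughout. Both ops compute the DCN interaction x0 * (xl . w):
# cross_op_raw materialises the (batch, d, d) outer product first, while
# cross_op_better computes the scalar xl . w first -- algebraically identical
# but O(d) instead of O(d^2) memory per sample.
import numpy as np
import tensorflow as tf
from itertools import chain


def add_layer_summary(tag, value):
    # assumed implementation: record a histogram of the tensor under the tag
    tf.summary.histogram('{}/activation'.format(tag), value)


def cross_op_raw(xl, x0, weight, feature_size):
    # (batch, d, 1) x (batch, 1, d) -> (batch, d, d), then contract with w -> (batch, d)
    outer = tf.matmul(tf.expand_dims(x0, axis=2), tf.expand_dims(xl, axis=1))
    return tf.tensordot(outer, weight, axes=1)


def cross_op_better(xl, x0, weight, feature_size):
    # xl . w -> (batch, 1), then scale x0 -> (batch, d)
    transform = tf.reduce_sum(tf.multiply(xl, weight), axis=1, keepdims=True)
    return tf.multiply(x0, transform)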
def decode(self, encoder_output, labels, mode):
    """
    Apply the decoding function to the target sequence: the train decoder is
    used in training, the infer decoder otherwise.
    Input
        encoder_output: ENCODER_OUTPUT
        labels: {tokens:, seq_len:}
        mode: tf.estimator.ModeKeys
    Return
        DECODER_OUTPUT
    """
    with tf.variable_scope('decoding'):
        if mode == tf.estimator.ModeKeys.TRAIN:
            seq_emb_output = tf.nn.embedding_lookup(self.embedding, labels['tokens'])  # batch_size * max_len * emb_size
            input_len = labels['seq_len']
        elif mode == tf.estimator.ModeKeys.EVAL:
            seq_emb_output = None
            input_len = labels['seq_len']
        else:
            seq_emb_output = None
            input_len = None

        decoder_output = decoder(encoder_output, seq_emb_output, input_len,
                                 self.embedding, self.params, mode)

        add_layer_summary('decoder_output.state', decoder_output.state)
        add_layer_summary('decoder_output.output', decoder_output.output.rnn_output)

    return decoder_output
def encode(self, features, mode):
    """
    6 identical layers, each consisting of multi-head attention + add & norm
    + feed forward + add & norm
    input
        features: dict {'tokens':, 'seq_len':}
    output
        encoder_output: dimension unchanged after transformation
    """
    with tf.variable_scope('encoding', reuse=tf.AUTO_REUSE):
        encoder_input = self.embedding_func(features['tokens'], mode)  # batch * seq_len * emb_size
        self_mask = seq_mask_gen(features, self.params)

        for i in range(self.params['encode_attention_layers']):
            with tf.variable_scope('self_attention_layer_{}'.format(i), reuse=tf.AUTO_REUSE):
                encoder_input = multi_head_attention(key=encoder_input,
                                                     query=encoder_input,
                                                     value=encoder_input,
                                                     mask=self_mask,
                                                     params=self.params,
                                                     mode=mode)
                add_layer_summary('output', encoder_input)
                encoder_input = ffn(encoder_input, self.params, mode)
                add_layer_summary('ffn', encoder_input)

    return ENCODER_OUTPUT(output=encoder_input, state=encoder_input[:, -1, :])
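# Hedged sketch (assumption): seq_mask_gen builds the padding mask consumed by
# multi_head_attention -- 1 for real tokens and 0 for padding, shaped
# (batch, 1, key_len) so it broadcasts over query positions after tiling.
def seq_mask_gen(features, params):
    mask = tf.sequence_mask(features['seq_len'],
                            maxlen=tf.shape(features['tokens'])[1],
                            dtype=params['dtype'])  # batch * key_len
    return tf.expand_dims(mask, axis=1)  # batch * 1 * key_len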
def input_encode(self, features):
    with tf.variable_scope('input_encoding', reuse=False):
        encoder_output = self.general_encoder(features)

        add_layer_summary('state', encoder_output.state)
        add_layer_summary('output', encoder_output.output)
    return encoder_output
def neighbour_cls_loss(encoder_output, decoder_output, labels, params):
    """
    Quick-thought-style loss: source sentences are contiguous and targets are
    the same batch. Pairs within window_size of the diagonal are positive
    samples; all other pairs in the batch are negative samples.
    """
    sim_score = tf.matmul(encoder_output.state[0], decoder_output.state[0],
                          transpose_b=True)  # [batch, batch] sim score
    add_layer_summary(sim_score.name, sim_score)

    with tf.variable_scope('neighbour_similarity_loss'):
        batch_size = sim_score.get_shape().as_list()[0]
        # ignore self-similarity (zeros must match sim_score's dtype)
        sim_score = tf.matrix_set_diag(sim_score, tf.zeros([batch_size], dtype=sim_score.dtype))

        # create targets: set elements within the diagonal offset to 1
        targets = np.zeros(shape=(batch_size, batch_size))
        offset = params['window_size']  # offset of the diagonal
        for i in chain(range(1, 1 + offset), range(-offset, 0)):  # all offsets in [-offset, offset] except 0
            diag = np.diagonal(targets, offset=i)
            diag.setflags(write=True)
            diag.fill(1)

        targets = targets / np.sum(targets, axis=1, keepdims=True)  # normalize target probability to 1
        targets = tf.constant(targets, dtype=params['dtype'])

        losses = tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=sim_score)
        losses = tf.reduce_mean(losses)

    return losses
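# Worked example of the target construction above: with batch_size=4 and
# window_size=1 the raw 0/1 targets mark the +/-1 off-diagonals,
#   [[0, 1, 0, 0],
#    [1, 0, 1, 0],
#    [0, 1, 0, 1],
#    [0, 0, 1, 0]]
# and row normalization turns row 1 into [0.5, 0, 0.5, 0], so each sentence
# spreads probability 1 uniformly over its in-window neighbours.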
def scaled_dot_product_attention(key, value, query, mask):
    """
    Apply dot-product attention with mask
    input:
        key: batch_size * key_len * emb_size
        query: batch_size * query_len * emb_size
        value: batch_size * key_len * emb_size
        mask: batch_size * key_len
    output:
        weighted_val: batch_size * query_len * emb_size
    """
    with tf.variable_scope('scaled_dot_product_attention', reuse=tf.AUTO_REUSE):
        # scaled weight matrix: batch_size * query_len * key_len
        dk = tf.cast(key.shape.as_list()[-1], tf.float32)  # emb_size
        weight = tf.matmul(query, key, transpose_b=True) / (dk ** 0.5)

        # apply mask: positions with mask=0 get a large negative score and
        # therefore ~0 weight after the softmax
        weight += (1 - mask) * (-2 ** 32 + 1)

        # normalize over the key_len axis so the scores add up to 1
        weight = tf.nn.softmax(weight, axis=-1)
        tf.summary.image("attention", tf.expand_dims(weight[:1], -1))  # add channel dim
        add_layer_summary('attention', weight)

        # weighted value: batch_size * query_len * emb_size
        weighted_value = tf.matmul(weight, value)

    return weighted_value
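# Worked example of the mask trick above: with a single query and
# mask = [1, 1, 0], the scores become [s1, s2, s3 - (2**32 - 1)] before the
# softmax, so the padded position receives numerically zero attention weight:
#   softmax([0.3, 0.5, 0.2 - 2**32 + 1]) ~= [0.45, 0.55, 0.0]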
def model_fn(features, labels, mode, params):
    feature_columns = build_features()

    input = tf.feature_column.input_layer(features, feature_columns)

    with tf.variable_scope('init_fm_embedding'):
        # method1: load from checkpoint directly
        embeddings = tf.Variable(
            tf.contrib.framework.load_variable('./checkpoint/FM', 'fm_interaction/v'))
        weight = tf.Variable(
            tf.contrib.framework.load_variable('./checkpoint/FM', 'linear/w'))
        dense = tf.add(tf.matmul(input, embeddings), tf.matmul(input, weight))
        add_layer_summary('input', dense)

    with tf.variable_scope('Dense'):
        for i, unit in enumerate(params['hidden_units']):
            dense = tf.layers.dense(dense, units=unit, activation='relu',
                                    name='dense{}'.format(i))
            dense = tf.layers.batch_normalization(dense, center=True, scale=True,
                                                  trainable=True,
                                                  training=(mode == tf.estimator.ModeKeys.TRAIN))
            dense = tf.layers.dropout(dense, rate=params['dropout_rate'],
                                      training=(mode == tf.estimator.ModeKeys.TRAIN))
            add_layer_summary(dense.name, dense)

    with tf.variable_scope('output'):
        y = tf.layers.dense(dense, units=1, name='output')
        tf.summary.histogram(y.name, y)

    return y
def multi_head_attention(key, value, query, mask, params, mode):
    """
    Multi-head attention with mask
    input:
        key: batch_size * key_len * emb_size
        query: batch_size * query_len * emb_size
        value: batch_size * key_len * emb_size
        mask: batch_size * key_len
    output:
        weighted_val: batch_size * query_len * emb_size
    """
    with tf.variable_scope('multi_head_attention', reuse=tf.AUTO_REUSE):
        d_model = value.shape.as_list()[-1]  # emb_size
        # linear projection with dimension unchanged
        new_key = tf.layers.dense(key, units=d_model, activation=None)  # batch_size * key_len * emb_size
        new_value = tf.layers.dense(value, units=d_model, activation=None)
        new_query = tf.layers.dense(query, units=d_model, activation=None)

        # split d_model by num_head and compute attention in parallel
        # (batch_size * num_head) * key_len * (emb_size / num_head)
        new_key = tf.concat(tf.split(new_key, num_or_size_splits=params['num_head'], axis=-1), axis=0)
        new_value = tf.concat(tf.split(new_value, num_or_size_splits=params['num_head'], axis=-1), axis=0)
        new_query = tf.concat(tf.split(new_query, num_or_size_splits=params['num_head'], axis=-1), axis=0)

        # calculate dot-product attention
        weighted_val = scaled_dot_product_attention(new_key, new_value, new_query,
                                                    tf.tile(mask, [params['num_head'], 1, 1]))

        # concat num_head back:
        # (batch_size * num_head) * query_len * (emb_size / num_head) -> batch_size * query_len * emb_size
        weighted_val = tf.concat(tf.split(weighted_val, num_or_size_splits=params['num_head'], axis=0), axis=-1)

        # linear projection
        weighted_val = tf.layers.dense(weighted_val, units=d_model, activation=None)
        # dropout
        weighted_val = tf.layers.dropout(weighted_val, rate=params['dropout_rate'],
                                         training=(mode == tf.estimator.ModeKeys.TRAIN))
        add_layer_summary('raw_multi_head', weighted_val)
        weighted_val = add_and_norm_layer(query, weighted_val)

    return weighted_val
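# Hedged sketch (assumption): add_and_norm_layer is the transformer residual
# connection followed by the layer_norm defined later in this section.
def add_and_norm_layer(x, sub_layer_x):
    with tf.variable_scope('add_and_norm', reuse=tf.AUTO_REUSE):
        return layer_norm(tf.add(x, sub_layer_x))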
def encode(self, features):
    with tf.variable_scope('encoding'):
        encoder_output = self.general_encoder(features)

        add_layer_summary('encoder_output.state', encoder_output.state)
        add_layer_summary('encoder_output.output', encoder_output.output)
    return encoder_output
def init(self):
    with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
        self.embedding = tf.get_variable(dtype=self.params['dtype'],
                                         initializer=tf.constant(self.params['pretrain_embedding']),
                                         name='word_embedding')
        add_layer_summary(self.embedding.name, self.embedding)
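# Usage note (assumption): self.params['pretrain_embedding'] is expected to be
# a numpy array of shape (vocab_size, emb_size), e.g. pre-trained word2vec
# vectors, so the variable inherits both its shape and its initial values from
# the tf.constant initializer; its dtype must match self.params['dtype'].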
def model_fn(features, labels, mode, params):
    sparse_columns, dense_columns = build_features(params['numeric_handle'])

    with tf.variable_scope('EmbeddingInput'):
        embedding_input = []
        for f_sparse in sparse_columns:
            sparse_input = tf.feature_column.input_layer(features, f_sparse)

            input_dim = sparse_input.get_shape().as_list()[-1]
            init = tf.random_normal(shape=[input_dim, params['embedding_dim']])

            weight = tf.get_variable('w_{}'.format(f_sparse.name),
                                     dtype=tf.float32, initializer=init)
            add_layer_summary(weight.name, weight)

            embedding_input.append(tf.matmul(sparse_input, weight))

        dense = tf.concat(embedding_input, axis=1, name='embedding_concat')
        add_layer_summary(dense.name, dense)

        # if numeric features are treated as dense features, concatenate them
        # with the embeddings; otherwise they are concatenated with the sparse input
        if params['numeric_handle'] == 'dense':
            numeric_input = tf.feature_column.input_layer(features, dense_columns)

            numeric_input = tf.layers.batch_normalization(numeric_input, center=True, scale=True,
                                                          trainable=True,
                                                          training=(mode == tf.estimator.ModeKeys.TRAIN))
            add_layer_summary(numeric_input.name, numeric_input)
            dense = tf.concat([dense, numeric_input], axis=1, name='numeric_concat')
            add_layer_summary(dense.name, dense)

    with tf.variable_scope('MLP'):
        for i, unit in enumerate(params['hidden_units']):
            dense = tf.layers.dense(dense, units=unit, activation='relu',
                                    name='Dense_{}'.format(i))
            if mode == tf.estimator.ModeKeys.TRAIN:
                add_layer_summary(dense.name, dense)
            dense = tf.layers.dropout(dense, rate=params['dropout_rate'],
                                      training=(mode == tf.estimator.ModeKeys.TRAIN))

    with tf.variable_scope('output'):
        y = tf.layers.dense(dense, units=1, name='output')

    return y
def encode(self, features, mode):
    """
    RNN encoder
    """
    with tf.variable_scope('encoding', reuse=tf.AUTO_REUSE):
        encoder_input = self.embedding_func(features['tokens'])
        encoder_output = rnn_encoder(encoder_input, features['seq_len'], self.params)

        add_layer_summary('encoder_output.state', encoder_output.state)
        add_layer_summary('encoder_output.output', encoder_output.output)
    return encoder_output
def sparse_embedding(feature_size, embedding_size, field_size, feat_ids, feat_vals, add_summary):
    with tf.variable_scope('Sparse_Embedding'):
        v = tf.get_variable(shape=[feature_size, embedding_size],
                            initializer=tf.truncated_normal_initializer(),
                            name='embedding_weight')

        embedding_matrix = tf.nn.embedding_lookup(v, feat_ids)  # batch * field_size * embedding_size
        embedding_matrix = tf.multiply(embedding_matrix,
                                       tf.reshape(feat_vals, [-1, field_size, 1]))

    if add_summary:
        add_layer_summary('embedding_matrix', embedding_matrix)

    return embedding_matrix
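# Usage sketch: with a libsvm-style record "1:0.5 3:1.0" parsed into
# feat_ids = [[1, 3]] and feat_vals = [[0.5, 1.0]] (field_size = 2), the
# lookup returns embedding rows 1 and 3, then scales them by 0.5 and 1.0,
# yielding a (1, 2, embedding_size) matrix.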
def model_fn_sparse(features, labels, mode, params):
    # hyper parameters
    data_params = params['data_params']
    field_size = data_params['field_size']
    feature_size = data_params['feature_size']
    embedding_size = data_params['embedding_size']

    # extract features
    feat_ids = tf.reshape(features['feat_ids'], shape=[-1, field_size])  # batch * field_size
    feat_vals = tf.reshape(features['feat_vals'], shape=[-1, field_size])  # batch * field_size

    # extract embedding
    with tf.variable_scope('extract_embedding'):
        embedding_matrix = sparse_embedding(feature_size, embedding_size, field_size,
                                            feat_ids, feat_vals,
                                            add_summary=True)  # (batch, field_size, embedding_size)
        dense_input = tf.reshape(embedding_matrix,
                                 [-1, field_size * embedding_size])  # (batch, field_size * embedding_size)

    # linear part
    linear_output = sparse_linear(feature_size, feat_ids, feat_vals, add_summary=True)

    # deep part
    dense_output = stack_dense_layer(dense_input, params['hidden_units'],
                                     params['dropout_rate'], params['batch_norm'],
                                     mode, add_summary=True)

    # CIN part
    cin_output = cin_layer(embedding_matrix, params['cin_layer_size'],
                           embedding_size, field_size)

    # concat and output
    with tf.variable_scope('output'):
        y = tf.concat([dense_output, cin_output, linear_output], axis=1)
        y = tf.layers.dense(y, units=1)
        add_layer_summary('output', y)

    return y
def model_fn_dense(features, labels, mode, params):
    dense_feature, sparse_feature = build_features()
    dense = tf.feature_column.input_layer(features, dense_feature)
    sparse = tf.feature_column.input_layer(features, sparse_feature)

    with tf.variable_scope('FM_component'):
        with tf.variable_scope('Linear'):
            linear_output = tf.layers.dense(sparse, units=1)
            add_layer_summary('linear_output', linear_output)

        with tf.variable_scope('second_order'):
            # reshape (batch_size, n_feature * emb_size) -> (batch_size, n_feature, emb_size)
            emb_size = dense_feature[0].variable_shape.as_list()[0]  # all features share the same emb dimension
            embedding_matrix = tf.reshape(dense, (-1, len(dense_feature), emb_size))
            add_layer_summary('embedding_matrix', embedding_matrix)

            # compared to FM, the embedding here is flatten(x * v), not v
            sum_square = tf.pow(tf.reduce_sum(embedding_matrix, axis=1), 2)
            square_sum = tf.reduce_sum(tf.pow(embedding_matrix, 2), axis=1)

            fm_output = tf.reduce_sum(tf.subtract(sum_square, square_sum) * 0.5,
                                      axis=1, keepdims=True)
            add_layer_summary('fm_output', fm_output)

    with tf.variable_scope('Deep_component'):
        for i, unit in enumerate(params['hidden_units']):
            dense = tf.layers.dense(dense, units=unit, activation='relu',
                                    name='dense{}'.format(i))
            dense = tf.layers.batch_normalization(dense, center=True, scale=True,
                                                  trainable=True,
                                                  training=(mode == tf.estimator.ModeKeys.TRAIN))
            dense = tf.layers.dropout(dense, rate=params['dropout_rate'],
                                      training=(mode == tf.estimator.ModeKeys.TRAIN))
            add_layer_summary(dense.name, dense)

    with tf.variable_scope('output'):
        y = dense + fm_output + linear_output
        add_layer_summary('output', y)

    return y
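# The sum-square minus square-sum step above is the standard O(n*k) FM
# identity: (sum_i v_i)^2 - sum_i v_i^2 = 2 * sum_{i<j} v_i ⊙ v_j
# (element-wise), so multiplying by 0.5 and reducing over the embedding axis
# yields the pairwise second-order term sum_{i<j} <v_i, v_j>.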
def model_fn_dense(features, labels, mode, params):
    dense_feature, sparse_feature = build_features()
    dense = tf.feature_column.input_layer(features, dense_feature)  # linear concat of embeddings
    sparse = tf.feature_column.input_layer(features, sparse_feature)

    field_size = len(dense_feature)
    embedding_size = dense_feature[0].variable_shape.as_list()[-1]
    embedding_matrix = tf.reshape(dense, [-1, field_size, embedding_size])  # batch * field_size * emb_size

    with tf.variable_scope('Linear_part'):
        linear_output = tf.layers.dense(sparse, units=1)
        add_layer_summary('linear_output', linear_output)

    with tf.variable_scope('Elementwise_Interaction'):
        elementwise_list = []
        for i in range(field_size):
            for j in range(i + 1, field_size):
                vi = tf.gather(embedding_matrix, indices=i, axis=1, batch_dims=0, name='vi')  # batch * emb_size
                vj = tf.gather(embedding_matrix, indices=j, axis=1, batch_dims=0, name='vj')
                elementwise_list.append(tf.multiply(vi, vj))  # batch * emb_size
        elementwise_matrix = tf.stack(elementwise_list)  # (N*(N-1)/2) * batch * emb_size
        elementwise_matrix = tf.transpose(elementwise_matrix, [1, 0, 2])  # batch * (N*(N-1)/2) * emb_size

    with tf.variable_scope('Attention_Net'):
        # 2 fully connected layers
        dense = tf.layers.dense(elementwise_matrix,
                                units=params['attention_factor'],
                                activation='relu')  # batch * (N*(N-1)/2) * t
        add_layer_summary(dense.name, dense)
        attention_logit = tf.layers.dense(dense, units=1, activation=None)  # batch * (N*(N-1)/2) * 1
        # normalize over the interaction-pair axis (axis=1); a softmax over the
        # single-unit last axis would be a constant 1
        attention_weight = tf.nn.softmax(attention_logit, axis=1)
        add_layer_summary(attention_weight.name, attention_weight)

    with tf.variable_scope('Attention_pooling'):
        interaction_output = tf.reduce_sum(tf.multiply(elementwise_matrix, attention_weight), axis=1)  # batch * emb_size
        interaction_output = tf.layers.dense(interaction_output, units=1)  # batch * 1

    with tf.variable_scope('output'):
        y = interaction_output + linear_output
        add_layer_summary('output', y)

    return y
def output_encode(self, features, labels, mode):
    """
    For Quick Thought, the decoder is another encoder with different
    parameters; build_model later takes the [batch, batch] inner product of
    encoder_state and decoder_state.
    """
    with tf.variable_scope('output_encoding', reuse=False):
        if mode == tf.estimator.ModeKeys.PREDICT:
            encoder_output = self.general_encoder(features)
        else:
            encoder_output = self.general_encoder(labels)

        add_layer_summary('state', encoder_output.state)
        add_layer_summary('output', encoder_output.output)
    return encoder_output
def model_fn_dense(features, labels, mode, params):
    dense_feature, sparse_feature = build_features()
    dense_input = tf.feature_column.input_layer(features, dense_feature)
    sparse_input = tf.feature_column.input_layer(features, sparse_feature)

    # linear part
    with tf.variable_scope('Linear_component'):
        linear_output = tf.layers.dense(sparse_input, units=1)
        add_layer_summary('linear_output', linear_output)

    field_size = len(dense_feature)
    emb_size = dense_feature[0].variable_shape.as_list()[-1]
    embedding_matrix = tf.reshape(dense_input, [-1, field_size, emb_size])

    # SENET layer to get a reweighted embedding matrix
    senet_embedding_matrix = SENET_layer(embedding_matrix, field_size, emb_size,
                                         pool_op=params['pool_op'],
                                         ratio=params['senet_ratio'])

    # combination layer & bilinear interaction
    BI_org = Bilinear_layer(embedding_matrix, field_size, emb_size,
                            type=params['model_type'], name='org')
    BI_senet = Bilinear_layer(senet_embedding_matrix, field_size, emb_size,
                              type=params['model_type'], name='senet')

    combination_layer = tf.concat([BI_org, BI_senet], axis=1)

    # deep part
    dense_output = stack_dense_layer(combination_layer, params['hidden_units'],
                                     params['dropout_rate'], params['batch_norm'],
                                     mode, add_summary=True)

    with tf.variable_scope('output'):
        y = dense_output + linear_output
        add_layer_summary('output', y)

    return y
def ffn(x, params, mode):
    """
    Position-wise feed-forward sub-layer, followed by add & norm
    """
    with tf.variable_scope('ffn', reuse=tf.AUTO_REUSE):
        d_model = x.shape.as_list()[-1]  # emb_size
        y = tf.layers.dense(x, units=params['ffn_hidden'], activation='relu')
        add_layer_summary('ffn_hidden1', y)
        y = tf.layers.dense(y, units=d_model, activation=None)
        y = tf.layers.dropout(y, rate=params['dropout_rate'],
                              training=(mode == tf.estimator.ModeKeys.TRAIN))
        add_layer_summary('ffn_hidden2', y)
        y = add_and_norm_layer(x, y)
    return y
def model_fn_sparse(features, labels, mode, params):
    # hyper parameters
    data_params = params['data_params']
    field_size = data_params['field_size']
    feature_size = data_params['feature_size']
    embedding_size = data_params['embedding_size']

    # extract features
    feat_ids = tf.reshape(features['feat_ids'], shape=[-1, field_size])  # batch * field_size
    feat_vals = tf.reshape(features['feat_vals'], shape=[-1, field_size])  # batch * field_size

    # extract embedding
    embedding_matrix = sparse_embedding(feature_size, embedding_size, field_size,
                                        feat_ids, feat_vals, add_summary=True)

    # linear output
    linear_output = sparse_linear(feature_size, feat_ids, feat_vals, add_summary=True)

    with tf.variable_scope('BI_Pooling'):
        sum_square = tf.pow(tf.reduce_sum(embedding_matrix, axis=1), 2)
        square_sum = tf.reduce_sum(tf.pow(embedding_matrix, 2), axis=1)
        dense = tf.subtract(sum_square, square_sum)
        add_layer_summary(dense.name, dense)

    # fully connected stacked dense layers
    dense = stack_dense_layer(dense, params['hidden_units'],
                              dropout_rate=params['dropout_rate'],
                              batch_norm=params['batch_norm'],
                              mode=mode, add_summary=True)

    with tf.variable_scope('output'):
        y = linear_output + dense
        add_layer_summary('output', y)

    return y
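# Note the difference from the FM second-order term: BI_Pooling keeps the
# (batch, embedding_size) interaction vector instead of reducing it to a
# scalar and feeds it through the MLP, which is what makes this NFM rather
# than plain FM.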
def stack_dense_layer(dense, hidden_units, dropout_rate, batch_norm, mode, add_summary):
    with tf.variable_scope('Dense'):
        for i, unit in enumerate(hidden_units):
            dense = tf.layers.dense(dense, units=unit, activation='relu',
                                    name='dense{}'.format(i))
            if batch_norm:
                dense = tf.layers.batch_normalization(dense, center=True, scale=True,
                                                      trainable=True,
                                                      training=(mode == tf.estimator.ModeKeys.TRAIN))
            if dropout_rate > 0:
                dense = tf.layers.dropout(dense, rate=dropout_rate,
                                          training=(mode == tf.estimator.ModeKeys.TRAIN))
            if add_summary:
                add_layer_summary(dense.name, dense)

    return dense
def sparse_linear(feature_size, feat_ids, feat_vals, add_summary):
    with tf.variable_scope('Linear_output'):
        weight = tf.get_variable(shape=[feature_size],
                                 initializer=tf.truncated_normal_initializer(),
                                 name='linear_weight')
        bias = tf.get_variable(shape=[1],
                               initializer=tf.glorot_uniform_initializer(),
                               name='linear_bias')

        linear_output = tf.nn.embedding_lookup(weight, feat_ids)  # batch * field_size
        linear_output = tf.reduce_sum(tf.multiply(linear_output, feat_vals), axis=1, keepdims=True)
        linear_output = tf.add(linear_output, bias)

    if add_summary:
        add_layer_summary('linear_output', linear_output)

    return linear_output
def Bilinear_layer(embedding_matrix, field_size, emb_size, type, name):
    # bilinear layer: combine inner and element-wise product
    interaction_list = []
    with tf.variable_scope('BI_interaction_{}'.format(name)):
        if type == 'field_all':
            weight = tf.get_variable(shape=(emb_size, emb_size),
                                     initializer=tf.truncated_normal_initializer(),
                                     name='Bilinear_weight_{}'.format(name))
        for i in range(field_size):
            if type == 'field_each':
                weight = tf.get_variable(shape=(emb_size, emb_size),
                                         initializer=tf.truncated_normal_initializer(),
                                         name='Bilinear_weight_{}_{}'.format(i, name))
            for j in range(i + 1, field_size):
                if type == 'field_interaction':
                    weight = tf.get_variable(shape=(emb_size, emb_size),
                                             initializer=tf.truncated_normal_initializer(),
                                             name='Bilinear_weight_{}_{}_{}'.format(i, j, name))
                vi = tf.gather(embedding_matrix, indices=i, axis=1, batch_dims=0,
                               name='v{}'.format(i))  # batch * emb_size
                vj = tf.gather(embedding_matrix, indices=j, axis=1, batch_dims=0,
                               name='v{}'.format(j))  # batch * emb_size
                pij = tf.matmul(tf.multiply(vi, vj), weight)  # bilinear interaction: (vi \odot vj) * W
                interaction_list.append(pij)

        combination = tf.stack(interaction_list, axis=1)  # batch * (field_size * (field_size-1)/2) * emb_size
        combination = tf.reshape(combination,
                                 shape=[-1, int(emb_size * (field_size * (field_size - 1) / 2))])
        add_layer_summary('bilinear_output', combination)

    return combination
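# The three `type` options trade capacity against parameter count:
# 'field_all' shares one (emb_size, emb_size) matrix across all pairs,
# 'field_each' learns one per field (field_size matrices), and
# 'field_interaction' learns one per pair (field_size * (field_size - 1) / 2).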
def build_model(self, features, labels, mode):
    """
    Build model_fn for Quick Thought
    Input
        features: {tokens:, seq_len:}
        labels: {tokens:, seq_len:}
    Return
        tf.estimator.EstimatorSpec
    """
    input_encode = self.input_encode(features)
    output_encode = self.output_encode(features, labels, mode)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = self.vectorize([input_encode.state[0], output_encode.state[0]], features)
        return tf.estimator.EstimatorSpec(mode=tf.estimator.ModeKeys.PREDICT,
                                          predictions=predictions)

    sim_score = tf.matmul(input_encode.state[0], output_encode.state[0],
                          transpose_b=True)  # [batch, batch] sim score
    add_layer_summary('sim_score', sim_score)

    loss = self.compute_loss(sim_score)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=get_learning_rate(self.params))
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            if self.params['clip_gradient']:
                train_op = gradient_clipping(optimizer, loss,
                                             self.params['lower_gradient'],
                                             self.params['upper_gradient'])
            else:
                train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # EVAL: train_op is undefined here, so return loss only
    return tf.estimator.EstimatorSpec(mode, loss=loss)
def cross_op(xk, x0, layer_size_prev, layer_size_curr, layer, emb_size, field_size):
    # Hadamard product: (batch * D * HK-1 * 1) x (batch * D * 1 * H0) -> batch * D * HK-1 * H0
    zk = tf.matmul(tf.expand_dims(tf.transpose(xk, perm=(0, 2, 1)), 3),
                   tf.expand_dims(tf.transpose(x0, perm=(0, 2, 1)), 2))
    zk = tf.reshape(zk, [-1, emb_size, field_size * layer_size_prev])  # batch * D * HK-1 * H0 -> batch * D * (HK-1 * H0)
    add_layer_summary('zk_{}'.format(layer), zk)

    # convolution with HK channels: (batch * D * (HK-1 * H0)) x ((HK-1 * H0) * HK) -> batch * D * HK
    kernel = tf.get_variable(name='kernel{}'.format(layer),
                             shape=(field_size * layer_size_prev, layer_size_curr))
    xkk = tf.matmul(zk, kernel)
    xkk = tf.transpose(xkk, perm=[0, 2, 1])  # batch * HK * D
    add_layer_summary('Xk_{}'.format(layer), xkk)

    return xkk
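# Hedged sketch (assumption): the cin_layer referenced by the xDeepFM
# model_fns chains cross_op over params['cin_layer_size'], sum-pools each
# layer's feature map over the embedding axis, and concatenates the pooled
# vectors as the CIN output.
def cin_layer(x0, cin_layer_size, emb_size, field_size):
    pooling_output = []
    with tf.variable_scope('cin_layer'):
        xk = x0  # batch * field_size * emb_size
        layer_size_prev = field_size
        for layer, layer_size_curr in enumerate(cin_layer_size):
            xk = cross_op(xk, x0, layer_size_prev, layer_size_curr,
                          layer, emb_size, field_size)  # batch * HK * D
            pooling_output.append(tf.reduce_sum(xk, axis=2))  # batch * HK
            layer_size_prev = layer_size_curr
    return tf.concat(pooling_output, axis=1)  # batch * sum(HK)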
def model_fn_sparse(features, labels, mode, params):
    # hyper parameters
    data_params = params['data_params']
    field_size = data_params['field_size']
    feature_size = data_params['feature_size']
    embedding_size = data_params['embedding_size']

    # extract features
    feat_ids = tf.reshape(features['feat_ids'], shape=[-1, field_size])  # (batch, field_size)
    feat_vals = tf.reshape(features['feat_vals'], shape=[-1, field_size])  # (batch, field_size)

    # extract embedding
    with tf.variable_scope('extract_embedding'):
        embedding_matrix = sparse_embedding(feature_size, embedding_size, field_size,
                                            feat_ids, feat_vals,
                                            add_summary=True)  # (batch, field_size, embedding_size)
        dense_input = tf.reshape(embedding_matrix,
                                 [-1, field_size * embedding_size])  # (batch, field_size * embedding_size)

    # deep part
    dense = stack_dense_layer(dense_input, params['hidden_units'],
                              params['dropout_rate'], params['batch_norm'],
                              mode, add_summary=True)

    # cross part
    xl = cross_layer(dense_input, params['cross_layers'])

    with tf.variable_scope('stack'):
        x_stack = tf.concat([dense, xl], axis=1)

    with tf.variable_scope('output'):
        y = tf.layers.dense(x_stack, units=1)
        add_layer_summary('output', y)

    return y
def model_fn(features, labels, mode, params):
    dense_feature = build_features()
    dense = tf.feature_column.input_layer(features, dense_feature)

    # stacked residual layers
    with tf.variable_scope('Residual_layers'):
        for i, unit in enumerate(params['hidden_units']):
            dense = residual_layer(dense, unit,
                                   dropout_rate=params['dropout_rate'],
                                   batch_norm=params['batch_norm'], mode=mode)
            add_layer_summary('residual_layer{}'.format(i), dense)

    with tf.variable_scope('output'):
        y = tf.layers.dense(dense, units=1)
        add_layer_summary('output', y)

    return y
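# Hedged sketch (assumption): residual_layer follows the Deep Crossing
# residual unit -- project to `unit` hidden units with ReLU, project back to
# the input width, add the skip connection, then apply ReLU, with optional
# batch norm / dropout in between.
def residual_layer(x, unit, dropout_rate, batch_norm, mode):
    input_dim = x.get_shape().as_list()[-1]
    hidden = tf.layers.dense(x, units=unit, activation='relu')
    if batch_norm:
        hidden = tf.layers.batch_normalization(hidden, center=True, scale=True,
                                               trainable=True,
                                               training=(mode == tf.estimator.ModeKeys.TRAIN))
    if dropout_rate > 0:
        hidden = tf.layers.dropout(hidden, rate=dropout_rate,
                                   training=(mode == tf.estimator.ModeKeys.TRAIN))
    hidden = tf.layers.dense(hidden, units=input_dim, activation=None)
    return tf.nn.relu(tf.add(hidden, x))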
def layer_norm(x):
    """
    Layer normalization (Ba et al.): normalize along the feature axis, then
    apply a learned scale and shift.
    """
    with tf.variable_scope('layer_normalization', reuse=tf.AUTO_REUSE):
        d_model = x.shape.as_list()[-1]

        epsilon = tf.constant(np.finfo(np.float32).eps)
        mean, variance = tf.nn.moments(x, axes=-1, keep_dims=True)
        x = (x - mean) / ((variance + epsilon) ** 0.5)  # do layer norm
        add_layer_summary('norm', x)

        kernel = tf.get_variable('norm_kernel', shape=(d_model,),
                                 initializer=tf.ones_initializer())
        bias = tf.get_variable('norm_bias', shape=(d_model,),
                               initializer=tf.zeros_initializer())
        x = tf.multiply(kernel, x) + bias
        add_layer_summary('norm_transform', x)
    return x
def model_fn_dense(features, labels, mode, params):
    dense_feature, sparse_feature = build_features()
    dense = tf.feature_column.input_layer(features, dense_feature)
    sparse = tf.feature_column.input_layer(features, sparse_feature)

    field_size = len(dense_feature)
    embedding_size = dense_feature[0].variable_shape.as_list()[-1]
    embedding_matrix = tf.reshape(dense, [-1, field_size, embedding_size])  # batch * field_size * emb_size

    with tf.variable_scope('Linear_output'):
        linear_output = tf.layers.dense(sparse, units=1)
        add_layer_summary('linear_output', linear_output)

    with tf.variable_scope('BI_Pooling'):
        sum_square = tf.pow(tf.reduce_sum(embedding_matrix, axis=1), 2)
        square_sum = tf.reduce_sum(tf.pow(embedding_matrix, 2), axis=1)
        dense = tf.subtract(sum_square, square_sum)
        add_layer_summary(dense.name, dense)

    dense = stack_dense_layer(dense, params['hidden_units'],
                              dropout_rate=params['dropout_rate'],
                              batch_norm=params['batch_norm'],
                              mode=mode, add_summary=True)

    with tf.variable_scope('output'):
        y = linear_output + dense
        add_layer_summary('output', y)

    return y
def model_fn_dense(features, labels, mode, params):
    dense_feature, sparse_feature = build_features()
    dense_input = tf.feature_column.input_layer(features, dense_feature)
    sparse_input = tf.feature_column.input_layer(features, sparse_feature)

    # linear part
    with tf.variable_scope('Linear_component'):
        linear_output = tf.layers.dense(sparse_input, units=1)
        add_layer_summary('linear_output', linear_output)

    # deep part
    dense_output = stack_dense_layer(dense_input, params['hidden_units'],
                                     params['dropout_rate'], params['batch_norm'],
                                     mode, add_summary=True)

    # CIN part
    emb_size = dense_feature[0].variable_shape.as_list()[-1]
    field_size = len(dense_feature)
    embedding_matrix = tf.reshape(dense_input, [-1, field_size, emb_size])  # batch * field_size * emb_size
    add_layer_summary('embedding_matrix', embedding_matrix)
    cin_output = cin_layer(embedding_matrix, params['cin_layer_size'], emb_size, field_size)

    with tf.variable_scope('output'):
        y = tf.concat([dense_output, cin_output, linear_output], axis=1)
        y = tf.layers.dense(y, units=1)
        add_layer_summary('output', y)

    return y