def gru_cell(prev_hidden: tf.Tensor, input: tf.Tensor) -> tf.Tensor:
    """Compute a single step of a GRU (following the PyTorch parameterization).

    See PyTorch GRUCell,
    https://pytorch.org/docs/stable/generated/torch.nn.GRUCell.html
    for the definition of this operation & trainable variables.

    Arguments:
    prev_hidden -- shape (batch_size x hidden_size), the previous GRU state,
                   e.g. returned by gru_cell
    input -- shape (batch_size x input_size)

    Returns:
    tensor of shape (batch_size x hidden_size), a new GRU state
    """
    batch_size, hidden_size = assert_shape(prev_hidden, (None, None))
    _, input_size = assert_shape(input, (batch_size, None))
    dtype = prev_hidden.dtype
    weight_i = tf.get_variable(
        "weight_i",
        (3, input_size, hidden_size),
        dtype=dtype,
        initializer=tf.glorot_normal_initializer(),
    )
    bias_i = tf.get_variable(
        "bias_i", (3, hidden_size), dtype=dtype,
        initializer=tf.zeros_initializer())
    weight_h = tf.get_variable(
        "weight_h",
        (3, hidden_size, hidden_size),
        dtype=dtype,
        initializer=tf.glorot_normal_initializer(),
    )
    bias_h = tf.get_variable(
        "bias_h", (3, hidden_size), dtype=dtype,
        initializer=tf.zeros_initializer())

    reset_i, update_i, candidate_i = tf.unstack(
        input @ weight_i + tf.expand_dims(bias_i, 1))
    reset_h, update_h, candidate_h = tf.unstack(
        prev_hidden @ weight_h + tf.expand_dims(bias_h, 1))

    reset = tf.sigmoid(reset_i + reset_h)
    update = tf.sigmoid(update_i + update_h)
    candidate = tf.tanh(candidate_i + reset * candidate_h)
    return (1 - update) * candidate + update * prev_hidden
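# Usage sketch (an assumption, not part of the original source): gru_cell
# creates its weights via tf.get_variable, so every timestep must run inside a
# single variable scope with reuse enabled after the first step. `inputs` here
# is a hypothetical time-major (time, batch, input_size) tensor with a static
# batch dimension.
def run_gru(inputs: tf.Tensor, hidden_size: int) -> tf.Tensor:
    _, batch_size, _ = inputs.shape.as_list()
    state = tf.zeros((batch_size, hidden_size), dtype=inputs.dtype)
    for t, x_t in enumerate(tf.unstack(inputs, axis=0)):
        with tf.variable_scope("gru", reuse=(t > 0)):
            state = gru_cell(state, x_t)
    return state  # final GRU state, shape (batch_size, hidden_size)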
def _convolutional_feature_extractor(self, stft, conv_layer_dropout):
    """Build the end-to-end feature extraction component, consisting of two
    convolutional layers combined with two max-pooling layers.

    Arguments:
        stft: the framed mel spectrograms of the audio files
    Returns:
        conv_out: the features extracted by filtering the mel spectrogram
            through the convolutional layers
    """
    self.init = tf.glorot_normal_initializer()
    with tf.variable_scope("Convbb", reuse=tf.AUTO_REUSE, initializer=self.init):
        stft = batch_normalization(stft)
        conv1 = self.conv_layer(input_data=tf.expand_dims(stft, axis=3),
                                filter_size=8, channels_in=1, channels_out=32,
                                strides=[1, 2, 2, 1],
                                conv_layer_dropout=conv_layer_dropout,
                                name="conv1")
        conv2 = tf.nn.max_pool(conv1, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
        conv2 = batch_normalization(conv2)
        conv3 = self.conv_layer(input_data=conv2, filter_size=4,
                                channels_in=32, channels_out=16,
                                strides=[1, 2, 2, 1],
                                conv_layer_dropout=conv_layer_dropout,
                                name="conv3")
        conv3 = tf.nn.max_pool(conv3, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
        conv3 = batch_normalization(conv3)
        conv_out = tf.reshape(conv3, (-1, 256))
    return conv_out
def __init__(self, name, in_channels, num_layers, growth_rate, dropout_rate,
             bottleneck, build_method=Weights.impl.sandbox, ranks=None,
             kernel_initializer=tf.glorot_normal_initializer(),
             bias_initializer=tf.zeros_initializer(),
             kernel_regularizer=None, bias_regularizer=None):
    """
    :param name: Variable scope
    :param num_layers: How many layers
    """
    super().__init__()
    self.name = name
    self.in_channels = in_channels
    self.num_layers = num_layers
    self.growth_rate = growth_rate
    self.build_method = build_method
    self.dropout_rate = dropout_rate
    self.bottleneck = bottleneck
    self.ranks = ranks
    self.kernel_initializer = kernel_initializer
    self.bias_initializer = bias_initializer
    self.kernel_regularizer = kernel_regularizer
    self.bias_regularizer = bias_regularizer
def _get_variable(self, name, shape, initializer=None):
    if initializer is None:
        initializer = getattr(
            self, '_weight_initializer', tf.glorot_normal_initializer())
    else:
        assert callable(initializer)
    return tf.get_variable(
        name, shape, dtype=hub.dtype, initializer=initializer)
def get_initializer(initializer, dtype):
    if initializer == 'zeros':
        return tf.zeros_initializer(dtype=dtype)
    elif initializer == 'ones':
        return tf.ones_initializer(dtype=dtype)
    elif initializer == 'vs':
        return tf.variance_scaling_initializer(dtype=dtype)
    elif initializer == 'xavier':
        return tf.glorot_normal_initializer(dtype=dtype)
    elif initializer == 'he':
        # He initialization is variance scaling with scale=2.0; with the
        # default scale=1.0 this branch would be identical to 'vs'.
        return tf.variance_scaling_initializer(scale=2.0, dtype=dtype)
    else:
        raise NotImplementedError
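# Usage sketch (assumed context, not from the original source): mapping the
# string keys onto variable creation.
kernel = tf.get_variable(
    'kernel', shape=[128, 64],
    initializer=get_initializer('he', tf.float32))
bias = tf.get_variable(
    'bias', shape=[64],
    initializer=get_initializer('zeros', tf.float32))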
def feedforward_network(inputStates, inputSize, outputSize, num_fc_layers,
                        depth_fc_layers, tf_datatype, scope):
    with tf.variable_scope(str(scope)):
        # concat K entries together [bs x K x sa] --> [bs x ksa]
        inputState = tf.layers.flatten(inputStates)

        # vars
        intermediate_size = depth_fc_layers
        reuse = False
        initializer = tf.glorot_normal_initializer(
            seed=None, dtype=tf_datatype)
        fc = tf.layers.dense

        # make hidden layers
        for i in range(num_fc_layers):
            if i == 0:
                fc_i = fc(inputState, units=intermediate_size,
                          activation=None, kernel_initializer=initializer,
                          bias_initializer=initializer, reuse=reuse,
                          trainable=True)
            else:
                fc_i = fc(h_i, units=intermediate_size,
                          activation=None, kernel_initializer=initializer,
                          bias_initializer=initializer, reuse=reuse,
                          trainable=True)
            h_i = tf.nn.relu(fc_i)

        # make output layer
        z = fc(h_i, units=outputSize, activation=None,
               kernel_initializer=initializer, bias_initializer=initializer,
               reuse=reuse, trainable=True)
    return z
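# Usage sketch (hypothetical shapes and names, not from the original source):
# a dynamics-model style MLP over a batch of K=4 stacked state-action vectors.
# Note the function flattens the K axis itself, and `inputSize` is not read
# inside the function body.
input_states = tf.placeholder(tf.float32, [None, 4, 10])
predictions = feedforward_network(
    input_states, inputSize=40, outputSize=6, num_fc_layers=2,
    depth_fc_layers=64, tf_datatype=tf.float32, scope="dyn_model")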
def model_fn(model, features, labels, mode):
    global_step = tf.train.get_or_create_global_step()
    xavier_initializer = tf.glorot_normal_initializer()

    fc1_size = 128
    with tf.variable_scope('leader'):
        w1f = tf.get_variable(
            'w1f', shape=[fc1_size, 1], dtype=tf.float32,
            initializer=tf.random_uniform_initializer(-0.01, 0.01))
        b1f = tf.get_variable(
            'b1f', shape=[1], dtype=tf.float32,
            initializer=tf.zeros_initializer())

    if mode == tf.estimator.ModeKeys.TRAIN:
        embedding = model.recv('embedding', tf.float32, require_grad=True)
    else:
        embedding = features['embedding']

    logits = tf.nn.bias_add(tf.matmul(embedding, w1f), b1f)

    if mode == tf.estimator.ModeKeys.TRAIN:
        y = tf.dtypes.cast(labels['y'], tf.float32)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.math.reduce_mean(loss)

        # calculate AUC
        pred = tf.math.sigmoid(logits)
        _, auc = tf.metrics.auc(labels=y, predictions=pred)

        logging_hook = tf.train.LoggingTensorHook({
            "loss": loss,
            "auc": auc
        }, every_n_iter=10)

        optimizer = tf.train.GradientDescentOptimizer(0.1)
        train_op = model.minimize(optimizer, loss, global_step=global_step)
        return model.make_spec(mode, loss=loss, train_op=train_op,
                               training_hooks=[logging_hook])

    if mode == tf.estimator.ModeKeys.PREDICT:
        return model.make_spec(mode, predictions=logits)
def __init__(self, shape, build_method=Weights.impl.sandbox, use_bias=True,
             kernel_initializer=tf.glorot_normal_initializer(),
             bias_initializer=tf.zeros_initializer(),
             kernel_regularizer=None, bias_regularizer=None):
    super().__init__()
    self._shape = shape
    self._build_method = build_method
    self._use_bias = use_bias
    self.kernel_initializer = kernel_initializer
    self.bias_initializer = bias_initializer
    self.kernel_regularizer = kernel_regularizer
    self.bias_regularizer = bias_regularizer
def _encoders(self):
    initializer = tf.glorot_normal_initializer()
    entity_encoder = gl.encoders.LookupEncoder(self.entity_num,
                                               self.hidden_dim,
                                               str2hash=self.s2h,
                                               ps_hosts=self.ps_hosts,
                                               init=initializer,
                                               name='entity_encoder')
    relation_encoder = gl.encoders.LookupEncoder(self.relation_num,
                                                 self.hidden_dim,
                                                 str2hash=self.s2h,
                                                 ps_hosts=self.ps_hosts,
                                                 init=initializer,
                                                 use_edge=True,
                                                 name='relation_encoder')
    return {"src": entity_encoder,
            "edge": relation_encoder,
            "dst": entity_encoder}
def linear(input: tf.Tensor, n_output: int, use_bias: bool = True) -> tf.Tensor:
    """A standard linear layer `W x + b`."""
    weight = tf.get_variable(
        "weight",
        dtype=input.dtype,
        shape=(input.shape[-1], n_output),
        initializer=tf.glorot_normal_initializer(),
    )
    output = input @ weight
    if use_bias:
        bias = tf.get_variable(
            "bias",
            dtype=input.dtype,
            shape=(n_output,),
            initializer=tf.zeros_initializer(),
        )
        output += bias
    return output
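# Usage sketch (an assumption, not from the original source): `linear` calls
# tf.get_variable with fixed names, so distinct layers need distinct variable
# scopes.
def two_layer_mlp(x: tf.Tensor, n_hidden: int, n_out: int) -> tf.Tensor:
    with tf.variable_scope("fc1"):
        h = tf.nn.relu(linear(x, n_hidden))
    with tf.variable_scope("fc2"):
        return linear(h, n_out)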
def __init__(self,
             shape,
             strides=(1, 1),
             use_bias=True,
             padding="SAME",
             # partitions=[0.8, 0.8],
             partitions=None,
             kernel_initializer=tf.glorot_normal_initializer(),
             bias_initializer=tf.zeros_initializer(),
             kernel_regularizer=None,
             bias_regularizer=None,
             ranks=None):
    """Custom implementation of the depthwise separable layers.

    The pointwise convolution is partitioned across the input channel
    dimension, whereas the depthwise and standard convolutions are not.
    """
    super().__init__()
    px = strides[0]
    py = strides[1]
    self._strides = [1, px, py, 1]

    # The two partitions
    self.partitions = partitions
    self._shape = shape
    self._padding = padding
    self._use_bias = use_bias
    self.kernel_initializer = kernel_initializer
    self.bias_initializer = bias_initializer
    self.kernel_regularizer = kernel_regularizer
    self.bias_regularizer = bias_regularizer
    # Rank for the core tensor G
    self.ranks = ranks
def _build_bilinear_layers(net, params):
    feat_emb = tf.reshape(net, (-1, deep_fields_size, org_emb_size))
    cnt = 0
    element_wise_product_list = []
    for i in range(0, deep_fields_size):
        for j in range(i + 1, deep_fields_size):
            with tf.variable_scope('weight_', reuse=tf.AUTO_REUSE):
                weight = tf.get_variable(
                    name='weight_' + str(cnt),
                    shape=[org_emb_size, org_emb_size],
                    initializer=tf.glorot_normal_initializer(
                        seed=random.randint(0, 1024)),
                    dtype=tf.float32)
            element_wise_product_list.append(
                tf.multiply(tf.matmul(feat_emb[:, i, :], weight),
                            feat_emb[:, j, :]))
            cnt += 1
    element_wise_product = tf.stack(element_wise_product_list)
    element_wise_product = tf.transpose(element_wise_product,
                                        perm=[1, 0, 2],
                                        name="element_wise_product")
    bilinear_output = tf.layers.flatten(element_wise_product)
    return bilinear_output
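# Shape sketch (hypothetical values, not from the original source; the
# function reads deep_fields_size and org_emb_size as module-level globals):
# with deep_fields_size = 4 and org_emb_size = 8 there are 4 * 3 / 2 = 6 field
# pairs, so `bilinear_output` has shape (batch, 6 * 8) = (batch, 48).
deep_fields_size, org_emb_size = 4, 8
net = tf.placeholder(tf.float32, [None, deep_fields_size * org_emb_size])
bilinear_output = _build_bilinear_layers(net, params={})
assert bilinear_output.shape.as_list() == [None, 48]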
def test_group_lasso_conv3d(self):
    shape = [3, 3, 3]
    video = tf.zeros([2, 3, 3, 3, 1])
    net = slim.conv3d(video, 5, shape, padding='VALID',
                      weights_initializer=tf.glorot_normal_initializer(),
                      scope='vconv1')
    conv3d_op = tf.get_default_graph().get_operation_by_name('vconv1/Conv3D')
    conv3d_weights = conv3d_op.inputs[1]
    threshold = 0.09
    flop_reg = flop_regularizer.GroupLassoFlopsRegularizer(
        [net.op], threshold=threshold)
    norm = tf.sqrt(tf.reduce_mean(tf.square(conv3d_weights), [0, 1, 2, 3]))
    alive = tf.reduce_sum(tf.cast(norm > threshold, tf.float32))
    with self.session():
        flop_coeff = 2 * shape[0] * shape[1] * shape[2]
        tf.compat.v1.global_variables_initializer().run()
        self.assertAllClose(flop_reg.get_cost(), flop_coeff * alive)
        self.assertAllClose(flop_reg.get_regularization_term(),
                            flop_coeff * tf.reduce_sum(norm))
def masked_dense(inputs,
                 units,
                 num_blocks=None,
                 exclusive=False,
                 kernel_initializer=None,
                 reuse=None,
                 name=None,
                 *args,  # pylint: disable=keyword-arg-before-vararg
                 **kwargs):
    """An autoregressively masked dense layer. Analogous to `tf.layers.dense`.

    See [Germain et al. (2015)][1] for detailed explanation.

    Arguments:
      inputs: Tensor input.
      units: Python `int` scalar representing the dimensionality of the output
        space.
      num_blocks: Python `int` scalar representing the number of blocks for the
        MADE masks.
      exclusive: Python `bool` scalar representing whether to zero the diagonal
        of the mask, used for the first layer of a MADE.
      kernel_initializer: Initializer function for the weight matrix. If `None`
        (default), weights are initialized using
        `tf.glorot_normal_initializer`.
      reuse: Python `bool` scalar representing whether to reuse the weights of
        a previous layer by the same name.
      name: Python `str` used to describe ops managed by this function.
      *args: `tf.layers.dense` arguments.
      **kwargs: `tf.layers.dense` keyword arguments.

    Returns:
      Output tensor.

    Raises:
      NotImplementedError: if rightmost dimension of `inputs` is unknown prior
        to graph execution.

    #### References

    [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle.
         MADE: Masked Autoencoder for Distribution Estimation. In
         _International Conference on Machine Learning_, 2015.
         https://arxiv.org/abs/1502.03509
    """
    # TODO(b/67594795): Better support of dynamic shape.
    input_depth = tf.compat.dimension_value(
        tensorshape_util.with_rank_at_least(inputs.shape, 1)[-1])
    if input_depth is None:
        raise NotImplementedError(
            'Rightmost dimension must be known prior to graph execution.')

    mask = _gen_mask(num_blocks, input_depth, units,
                     MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T

    if kernel_initializer is None:
        kernel_initializer = tf1.glorot_normal_initializer()

    def masked_initializer(shape, dtype=None, partition_info=None):
        return mask * kernel_initializer(shape, dtype, partition_info)

    with tf.name_scope(name or 'masked_dense'):
        layer = tf1.layers.Dense(
            units,
            kernel_initializer=masked_initializer,
            kernel_constraint=lambda x: mask * x,
            name=name,
            dtype=dtype_util.base_dtype(inputs.dtype),
            _scope=name,
            _reuse=reuse,
            *args,  # pylint: disable=keyword-arg-before-vararg
            **kwargs)
        return layer.apply(inputs)
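# Usage sketch (an assumption, not from the original source): a two-layer
# MADE-style network over 3-dimensional events. The first layer uses
# exclusive=True so output block i never depends on input block i; extra
# keyword arguments such as `activation` are forwarded to the dense layer.
x = tf.placeholder(tf.float32, [None, 3])
h = masked_dense(x, units=12, num_blocks=3, exclusive=True,
                 activation=tf.nn.relu, name='made_hidden')
shift = masked_dense(h, units=3, num_blocks=3, name='made_shift')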
def generator(inputs, is_train=True, reuse=False):
    image_size = 128
    # s32 = image_size // 32
    gf_dim = 64  # Dimension of gen filters in first conv layer. [64]
    c_dim = 1  # n_color 1
    w_init = tf.glorot_normal_initializer()
    gamma_init = tf.random_normal_initializer(1., 0.02)

    with tf.name_scope("GENERATOR"):
        with tf.variable_scope("generator", reuse=reuse):
            with tf.name_scope("net_in"):
                net_in = InputLayer(inputs, name='g/in')
            #############################################################################
            with tf.name_scope("layer0"):
                net_h0 = DenseLayer(net_in, n_units=(gf_dim * 32 * 4 * 4),
                                    W_init=w_init, act=tf.identity,
                                    name='g/h0/lin')
                net_h0 = ReshapeLayer(net_h0, shape=[-1, 4, 4, gf_dim * 32],
                                      name='g/h0/reshape')
                net_h0 = BatchNormLayer(net_h0, decay=0.9, act=tf.nn.relu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='g/h0/batch_norm')
            with tf.name_scope("layer1"):
                net_h1 = DeConv2d(net_h0, gf_dim * 8, (5, 5), strides=(2, 2),
                                  padding='SAME', act=None, W_init=w_init,
                                  name='g/h1/decon2d')
                net_h1 = BatchNormLayer(net_h1, decay=0.9, act=tf.nn.relu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='g/h1/batch_norm')
            with tf.name_scope("layer2"):
                net_h2 = DeConv2d(net_h1, gf_dim * 4, (5, 5), strides=(2, 2),
                                  padding='SAME', act=None, W_init=w_init,
                                  name='g/h2/decon2d')
                net_h2 = BatchNormLayer(net_h2, decay=0.9, act=tf.nn.relu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='g/h2/batch_norm')
            with tf.name_scope("layer3"):
                net_h3 = DeConv2d(net_h2, gf_dim * 2, (5, 5), strides=(2, 2),
                                  padding='SAME', act=None, W_init=w_init,
                                  name='g/h3/decon2d')
                net_h3 = BatchNormLayer(net_h3, decay=0.9, act=tf.nn.relu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='g/h3/batch_norm')
            with tf.name_scope("layer4"):
                net_h4 = DeConv2d(net_h3, gf_dim, (5, 5), strides=(2, 2),
                                  padding='SAME', act=None, W_init=w_init,
                                  name='g/h4/decon2d')
                net_h4 = BatchNormLayer(net_h4, decay=0.9, act=tf.nn.relu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='g/h4/batch_norm')
            with tf.name_scope("layer5"):
                net_h5 = DeConv2d(net_h4, c_dim, (5, 5), strides=(2, 2),
                                  padding='SAME', act=None, W_init=w_init,
                                  name='g/h5/decon2d')
                net_h5.outputs = tf.nn.tanh(net_h5.outputs)
    return net_h5
def discriminator2(inputs, is_train=True, reuse=False):
    df_dim = 32  # Dimension of discrim filters in first conv layer. [64]
    w_init = tf.glorot_normal_initializer()
    gamma_init = tf.random_normal_initializer(1., 0.02)
    lrelu = lambda x: tf.nn.leaky_relu(x, 0.2)

    with tf.name_scope("DISCRIMINATOR2"):
        with tf.variable_scope("discriminator2", reuse=reuse):
            with tf.name_scope("net_in"):
                net_in = InputLayer(inputs, name='d2/in')
            with tf.name_scope("layer0"):
                net_h0 = Conv2d(net_in, df_dim, (3, 3), (3, 3), act=lrelu,
                                padding='SAME', W_init=w_init,
                                name='d2/h0/conv2d')
            with tf.name_scope("layer1"):
                net_h1 = Conv2d(net_h0, df_dim * 2, (3, 3), (3, 3), act=None,
                                padding='SAME', W_init=w_init,
                                name='d2/h1/conv2d')
                net_h1 = BatchNormLayer(net_h1, decay=0.9, act=lrelu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='d2/h1/batch_norm')
            with tf.name_scope("layer2"):
                net_h2 = Conv2d(net_h1, df_dim * 4, (3, 3), (3, 3), act=None,
                                padding='SAME', W_init=w_init,
                                name='d2/h2/conv2d')
                net_h2 = BatchNormLayer(net_h2, decay=0.9, act=lrelu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='d2/h2/batch_norm')
            with tf.name_scope("layer3"):
                net_h3 = Conv2d(net_h2, df_dim * 8, (3, 3), (3, 3), act=None,
                                padding='SAME', W_init=w_init,
                                name='d2/h3/conv2d')
                net_h3 = BatchNormLayer(net_h3, decay=0.9, act=lrelu,
                                        is_train=is_train,
                                        gamma_init=gamma_init,
                                        name='d2/h3/batch_norm')
            with tf.name_scope("layer4"):
                net_h4 = FlattenLayer(net_h3, name='d2/h4/flatten')
                net_h4 = DenseLayer(net_h4, n_units=df_dim * 8,
                                    act=tf.identity, W_init=w_init,
                                    name='d2/h4/lin_sigmoid')
            with tf.name_scope("layer5"):
                net_h5 = FlattenLayer(net_h4, name='d2/h5/flatten')
                net_h5 = DenseLayer(net_h5, n_units=df_dim * 8,
                                    act=tf.identity, W_init=w_init,
                                    name='d2/h5/lin_sigmoid')
            # net_h6 = FlattenLayer(net_h5, name='d/h6/flatten')
            with tf.name_scope("layer6"):
                net_h6 = DenseLayer(net_h5, n_units=2, act=tf.identity,
                                    W_init=w_init, name='d2/h6/lin_sigmoid')
            logits2 = net_h6.outputs
            net_h6.outputs = tf.nn.softplus(net_h6.outputs)
    return net_h6, logits2
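# Usage sketch (hypothetical noise dimension, not from the original source):
# wiring the generator to this discriminator. Both functions return
# TensorLayer layers, so the image tensor is read from `.outputs`; the
# generator's five stride-2 deconvolutions upsample 4x4 to 128x128.
z = tf.placeholder(tf.float32, [None, 100], name='z_noise')
net_g = generator(z, is_train=True, reuse=False)
net_d2, d2_logits = discriminator2(net_g.outputs, is_train=True, reuse=False)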
def construct_network(self):
    self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
    self.char_ids = tf.placeholder(tf.int32, [None, None, None], name="char_ids")
    self.sentence_lengths = tf.placeholder(tf.int32, [None], name="sentence_lengths")
    self.word_lengths = tf.placeholder(tf.int32, [None, None], name="word_lengths")
    self.sentence_labels = tf.placeholder(tf.float32, [None], name="sentence_labels")
    self.word_labels = tf.placeholder(tf.float32, [None, None], name="word_labels")
    self.word_objective_weights = tf.placeholder(
        tf.float32, [None, None], name="word_objective_weights")
    self.sentence_objective_weights = tf.placeholder(
        tf.float32, [None], name="sentence_objective_weights")
    self.learningrate = tf.placeholder(tf.float32, name="learningrate")
    self.is_training = tf.placeholder(tf.int32, name="is_training")

    self.loss = 0.0
    input_tensor = None
    input_vector_size = 0

    self.initializer = None
    if self.config["initializer"] == "normal":
        self.initializer = tf.random_normal_initializer(mean=0.0, stddev=0.1)
    elif self.config["initializer"] == "glorot":
        self.initializer = tf.glorot_uniform_initializer()
    elif self.config["initializer"] == "xavier":
        self.initializer = tf.glorot_normal_initializer()
    zeros_initializer = tf.zeros_initializer()

    self.word_embeddings = tf.get_variable(
        "word_embeddings",
        shape=[len(self.word2id), self.config["word_embedding_size"]],
        initializer=(zeros_initializer if self.config["emb_initial_zero"] == True
                     else self.initializer),
        trainable=(True if self.config["train_embeddings"] == True else False))
    input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids)
    input_vector_size = self.config["word_embedding_size"]

    if self.config["char_embedding_size"] > 0 and self.config["char_recurrent_size"] > 0:
        with tf.variable_scope("chars"), tf.control_dependencies([
                tf.assert_equal(tf.shape(self.char_ids)[2],
                                tf.reduce_max(self.word_lengths),
                                message="Char dimensions don't match")]):
            self.char_embeddings = tf.get_variable(
                "char_embeddings",
                shape=[len(self.char2id), self.config["char_embedding_size"]],
                initializer=self.initializer,
                trainable=True)
            char_input_tensor = tf.nn.embedding_lookup(self.char_embeddings,
                                                       self.char_ids)

            s = tf.shape(char_input_tensor)
            char_input_tensor = tf.reshape(
                char_input_tensor,
                shape=[s[0] * s[1], s[2], self.config["char_embedding_size"]])
            _word_lengths = tf.reshape(self.word_lengths, shape=[s[0] * s[1]])

            char_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(
                self.config["char_recurrent_size"],
                use_peepholes=self.config["lstm_use_peepholes"],
                state_is_tuple=True,
                initializer=self.initializer,
                reuse=False)
            char_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(
                self.config["char_recurrent_size"],
                use_peepholes=self.config["lstm_use_peepholes"],
                state_is_tuple=True,
                initializer=self.initializer,
                reuse=False)

            char_lstm_outputs = tf.nn.bidirectional_dynamic_rnn(
                char_lstm_cell_fw,
                char_lstm_cell_bw,
                char_input_tensor,
                sequence_length=_word_lengths,
                dtype=tf.float32,
                time_major=False)
            _, ((_, char_output_fw), (_, char_output_bw)) = char_lstm_outputs
            char_output_tensor = tf.concat([char_output_fw, char_output_bw],
                                           axis=-1)
            char_output_tensor = tf.reshape(
                char_output_tensor,
                shape=[s[0], s[1], 2 * self.config["char_recurrent_size"]])
            char_output_vector_size = 2 * self.config["char_recurrent_size"]

            if self.config["lmcost_char_gamma"] > 0.0:
                self.loss += self.config["lmcost_char_gamma"] * self.construct_lmcost(
                    char_output_tensor, char_output_tensor,
                    self.sentence_lengths, self.word_ids,
                    "separate", "lmcost_char_separate")
            if self.config["lmcost_joint_char_gamma"] > 0.0:
                self.loss += self.config["lmcost_joint_char_gamma"] * self.construct_lmcost(
                    char_output_tensor, char_output_tensor,
                    self.sentence_lengths, self.word_ids,
                    "joint", "lmcost_char_joint")

            if self.config["char_hidden_layer_size"] > 0:
                char_output_tensor = tf.layers.dense(
                    char_output_tensor,
                    self.config["char_hidden_layer_size"],
                    activation=tf.tanh,
                    kernel_initializer=self.initializer)
                char_output_vector_size = self.config["char_hidden_layer_size"]

            if self.config["char_integration_method"] == "concat":
                input_tensor = tf.concat([input_tensor, char_output_tensor],
                                         axis=-1)
                input_vector_size += char_output_vector_size
            elif self.config["char_integration_method"] == "none":
                input_tensor = input_tensor
            else:
                raise ValueError("Unknown char integration method")

    self.word_representations = input_tensor

    dropout_input = self.config["dropout_input"] * tf.cast(
        self.is_training, tf.float32) + (
            1.0 - tf.cast(self.is_training, tf.float32))
    input_tensor = tf.nn.dropout(input_tensor, dropout_input,
                                 name="dropout_word")

    word_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(
        self.config["word_recurrent_size"],
        use_peepholes=self.config["lstm_use_peepholes"],
        state_is_tuple=True,
        initializer=self.initializer,
        reuse=False)
    word_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(
        self.config["word_recurrent_size"],
        use_peepholes=self.config["lstm_use_peepholes"],
        state_is_tuple=True,
        initializer=self.initializer,
        reuse=False)

    with tf.control_dependencies([
            tf.assert_equal(tf.shape(self.word_ids)[1],
                            tf.reduce_max(self.sentence_lengths),
                            message="Sentence dimensions don't match")]):
        (lstm_outputs_fw, lstm_outputs_bw), \
            ((_, lstm_output_fw), (_, lstm_output_bw)) = \
            tf.nn.bidirectional_dynamic_rnn(
                word_lstm_cell_fw, word_lstm_cell_bw, input_tensor,
                sequence_length=self.sentence_lengths,
                dtype=tf.float32, time_major=False)

    dropout_word_lstm = self.config["dropout_word_lstm"] * tf.cast(
        self.is_training, tf.float32) + (
            1.0 - tf.cast(self.is_training, tf.float32))
    lstm_outputs_fw = tf.nn.dropout(
        lstm_outputs_fw, dropout_word_lstm,
        noise_shape=tf.convert_to_tensor(
            [tf.shape(self.word_ids)[0], 1,
             self.config["word_recurrent_size"]],
            dtype=tf.int32))
    lstm_outputs_bw = tf.nn.dropout(
        lstm_outputs_bw, dropout_word_lstm,
        noise_shape=tf.convert_to_tensor(
            [tf.shape(self.word_ids)[0], 1,
             self.config["word_recurrent_size"]],
            dtype=tf.int32))
    lstm_outputs = tf.concat([lstm_outputs_fw, lstm_outputs_bw], -1)

    if self.config["whidden_layer_size"] > 0:
        lstm_outputs = tf.layers.dense(lstm_outputs,
                                       self.config["whidden_layer_size"],
                                       activation=tf.tanh,
                                       kernel_initializer=self.initializer)
    self.lstm_outputs = lstm_outputs

    lstm_output = tf.concat([lstm_output_fw, lstm_output_bw], -1)
    lstm_output = tf.nn.dropout(lstm_output, dropout_word_lstm)

    if self.config["sentence_composition"] == "last":
        processed_tensor = lstm_output
        self.attention_weights_unnormalised = tf.zeros_like(self.word_ids,
                                                            dtype=tf.float32)
    elif self.config["sentence_composition"] == "attention":
        with tf.variable_scope("attention"):
            attention_evidence = tf.layers.dense(
                lstm_outputs,
                self.config["attention_evidence_size"],
                activation=tf.tanh,
                kernel_initializer=self.initializer)
            attention_weights = tf.layers.dense(
                attention_evidence, 1,
                activation=None,
                kernel_initializer=self.initializer)
            attention_weights = tf.reshape(attention_weights,
                                           shape=tf.shape(self.word_ids))

            if self.config["attention_activation"] == "sharp":
                attention_weights = tf.exp(attention_weights)
            elif self.config["attention_activation"] == "soft":
                attention_weights = tf.sigmoid(attention_weights)
            elif self.config["attention_activation"] == "linear":
                pass
            else:
                raise ValueError("Unknown activation for attention: "
                                 + str(self.config["attention_activation"]))

            word_objective_loss = tf.square(attention_weights - self.word_labels)
            word_objective_loss = tf.where(
                tf.sequence_mask(self.sentence_lengths),
                word_objective_loss,
                tf.zeros_like(word_objective_loss))
            self.loss += self.config["word_objective_weight"] * tf.reduce_sum(
                self.word_objective_weights * word_objective_loss)

            self.attention_weights_unnormalised = attention_weights
            attention_weights = tf.where(
                tf.sequence_mask(self.sentence_lengths),
                attention_weights,
                tf.zeros_like(attention_weights))
            attention_weights = attention_weights / tf.reduce_sum(
                attention_weights, 1, keep_dims=True)
            processed_tensor = tf.reduce_sum(
                lstm_outputs * attention_weights[:, :, numpy.newaxis], 1)

    if self.config["hidden_layer_size"] > 0:
        processed_tensor = tf.layers.dense(
            processed_tensor,
            self.config["hidden_layer_size"],
            activation=tf.tanh,
            kernel_initializer=self.initializer)

    self.sentence_scores = tf.layers.dense(
        processed_tensor, 1,
        activation=tf.sigmoid,
        kernel_initializer=self.initializer,
        name="output_ff")
    self.sentence_scores = tf.reshape(self.sentence_scores,
                                      shape=[tf.shape(processed_tensor)[0]])

    self.loss += self.config["sentence_objective_weight"] * tf.reduce_sum(
        self.sentence_objective_weights
        * tf.square(self.sentence_scores - self.sentence_labels))

    if self.config["attention_objective_weight"] > 0.0:
        self.loss += self.config["attention_objective_weight"] * \
            (tf.reduce_sum(
                self.sentence_objective_weights * tf.square(
                    tf.reduce_max(
                        tf.where(
                            tf.sequence_mask(self.sentence_lengths),
                            self.attention_weights_unnormalised,
                            tf.zeros_like(self.attention_weights_unnormalised)
                            - 1e6),
                        axis=-1) - self.sentence_labels))
             + tf.reduce_sum(
                 self.sentence_objective_weights * tf.square(
                     tf.reduce_min(
                         tf.where(
                             tf.sequence_mask(self.sentence_lengths),
                             self.attention_weights_unnormalised,
                             tf.zeros_like(self.attention_weights_unnormalised)
                             + 1e6),
                         axis=-1) - 0.0)))

    self.token_scores = [
        tf.where(tf.sequence_mask(self.sentence_lengths),
                 self.attention_weights_unnormalised,
                 tf.zeros_like(self.attention_weights_unnormalised) - 1e6)
    ]

    if self.config["lmcost_lstm_gamma"] > 0.0:
        self.loss += self.config["lmcost_lstm_gamma"] * self.construct_lmcost(
            lstm_outputs_fw, lstm_outputs_bw,
            self.sentence_lengths, self.word_ids,
            "separate", "lmcost_lstm_separate")
    if self.config["lmcost_joint_lstm_gamma"] > 0.0:
        self.loss += self.config["lmcost_joint_lstm_gamma"] * self.construct_lmcost(
            lstm_outputs_fw, lstm_outputs_bw,
            self.sentence_lengths, self.word_ids,
            "joint", "lmcost_lstm_joint")

    self.train_op = self.construct_optimizer(self.config["opt_strategy"],
                                             self.loss, self.learningrate,
                                             self.config["clip"])
def model_fn(model, features, labels, mode):
    def sum_pooling(embeddings, slots):
        slot_embeddings = []
        for slot in slots:
            slot_embeddings.append(embeddings[_SLOT_2_IDX[slot]])
        if len(slot_embeddings) == 1:
            return slot_embeddings[0]
        return tf.add_n(slot_embeddings)

    global_step = tf.train.get_or_create_global_step()
    xavier_initializer = tf.glorot_normal_initializer()

    flt.feature.FeatureSlot.set_default_bias_initializer(
        tf.zeros_initializer())
    flt.feature.FeatureSlot.set_default_vec_initializer(
        tf.random_uniform_initializer(-0.0078125, 0.0078125))
    flt.feature.FeatureSlot.set_default_bias_optimizer(
        tf.train.FtrlOptimizer(learning_rate=0.01))
    flt.feature.FeatureSlot.set_default_vec_optimizer(
        tf.train.AdagradOptimizer(learning_rate=0.01))

    # deal with input cols
    categorical_embed = []
    num_slot, embed_dim = len(_SLOT_2_BUCKET), 8
    with tf.variable_scope("follower"):
        for slot, bucket_size in _SLOT_2_BUCKET:
            fs = model.add_feature_slot(slot, bucket_size)
            fc = model.add_feature_column(fs)
            categorical_embed.append(fc.add_vector(embed_dim))

    # concatenate all embeddings
    slot_embeddings = categorical_embed
    concat_embedding = tf.concat(slot_embeddings, axis=1)
    output_size = len(slot_embeddings) * embed_dim

    model.freeze_slots(features)

    with tf.variable_scope("follower"):
        fc1_size, fc2_size, fc3_size = 512, 256, 128
        w1 = tf.get_variable('w1', shape=[output_size, fc1_size],
                             dtype=tf.float32, initializer=xavier_initializer)
        b1 = tf.get_variable('b1', shape=[fc1_size], dtype=tf.float32,
                             initializer=tf.zeros_initializer())
        w2 = tf.get_variable('w2', shape=[fc1_size, fc2_size],
                             dtype=tf.float32, initializer=xavier_initializer)
        b2 = tf.get_variable('b2', shape=[fc2_size], dtype=tf.float32,
                             initializer=tf.zeros_initializer())
        w3 = tf.get_variable('w3', shape=[fc2_size, fc3_size],
                             dtype=tf.float32, initializer=xavier_initializer)
        b3 = tf.get_variable('b3', shape=[fc3_size], dtype=tf.float32,
                             initializer=tf.zeros_initializer())

    act1_l = tf.nn.relu(tf.nn.bias_add(tf.matmul(concat_embedding, w1), b1))
    act1_l = tf.layers.batch_normalization(act1_l, training=True)
    act2_l = tf.nn.relu(tf.nn.bias_add(tf.matmul(act1_l, w2), b2))
    act2_l = tf.layers.batch_normalization(act2_l, training=True)
    embedding = tf.nn.relu(tf.nn.bias_add(tf.matmul(act2_l, w3), b3))
    embedding = tf.layers.batch_normalization(embedding, training=True)

    if mode == tf.estimator.ModeKeys.TRAIN:
        embedding_grad = model.send('embedding', embedding, require_grad=True)
        optimizer = tf.train.GradientDescentOptimizer(0.1)
        train_op = model.minimize(optimizer, embedding,
                                  grad_loss=embedding_grad,
                                  global_step=global_step)
        return model.make_spec(mode, loss=tf.math.reduce_mean(embedding),
                               train_op=train_op)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        return model.make_spec(mode, predictions={'embedding': embedding})