def cross_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor:
    assert self.cross_attention_sublayer is not None
    assert self.n_cross_att_heads is not None
    assert self.input_for_cross_attention is not None

    encoder_att_states = get_attention_states(
        self.input_for_cross_attention)
    encoder_att_mask = get_attention_mask(self.input_for_cross_attention)

    # Layer normalization
    normalized_queries = layer_norm(queries)

    encoder_context, _ = attention(
        queries=normalized_queries,
        keys=encoder_att_states,
        values=encoder_att_states,
        keys_mask=encoder_att_mask,
        num_heads=self.n_cross_att_heads,
        dropout_callback=lambda x: dropout(
            x, self.attention_dropout_keep_prob, self.train_mode),
        use_bias=self.use_att_transform_bias)

    # Apply dropout
    encoder_context = dropout(
        encoder_context, self.dropout_keep_prob, self.train_mode)

    # Add residual connections
    return encoder_context + queries
def self_attention_sublayer(
        self, prev_layer: TransformerLayer) -> tf.Tensor:
    """Create the decoder self-attention sublayer with output mask."""
    # Layer normalization
    normalized_states = layer_norm(prev_layer.temporal_states)

    # Run self-attention
    # TODO handle attention histories
    self_context, _ = attention(
        queries=normalized_states,
        keys=normalized_states,
        values=normalized_states,
        keys_mask=prev_layer.temporal_mask,
        num_heads=self.n_heads_self,
        masked=True,
        dropout_callback=lambda x: dropout(
            x, self.self_att_dropout_keep_prob, self.train_mode),
        use_bias=self.use_att_transform_bias)

    # Apply dropout
    self_context = dropout(
        self_context, self.dropout_keep_prob, self.train_mode)

    # Add residual connections
    return self_context + prev_layer.temporal_states
def test_invalid_keep_prob(self):
    """Test invalid dropout keep probability values."""
    var = tf.constant(np.arange(5))
    train_mode = tf.constant(True)

    for kprob in [-1, 2, 0]:
        with self.assertRaises(ValueError):
            dropout(var, kprob, train_mode)
def func(
        train_mode: tf.Tensor,
        rnn_size: int,
        encoders: List[TemporalStatefulWithOutput]) -> tf.Tensor:

    if len(encoders) != 1:
        raise ValueError("Exactly one encoder required for this type of "
                         "projection. {} given.".format(len(encoders)))
    encoder = encoders[0]

    # shape (batch, state_size) -- mask out padded positions, sum over time
    masked_sum = tf.reduce_sum(
        encoder.temporal_states
        * tf.expand_dims(encoder.temporal_mask, 2), 1)

    # shape (batch, 1)
    lengths = tf.reduce_sum(encoder.temporal_mask, 1, keepdims=True)

    means = masked_sum / lengths

    encoder_rnn_size = means.get_shape()[1].value

    kernel_initializer = orthogonal_initializer()
    if encoder_rnn_size != rnn_size:
        kernel_initializer = None

    return dropout(
        tf.layers.dense(
            means, rnn_size, activation=tf.tanh,
            kernel_initializer=get_initializer(
                "encoders_projection/kernel", kernel_initializer),
            name="encoders_projection"),
        dropout_keep_prob, train_mode)
def initial_state(self) -> tf.Tensor:
    """Compute the initial decoder state.

    The part of the computation graph that computes the initial state
    of the decoder.
    """
    with tf.variable_scope("initial_state"):
        # pylint: disable=not-callable
        initial_state = dropout(
            self.encoder_projection(self.train_mode,
                                    self.rnn_size,
                                    self.encoders),
            self.dropout_keep_prob,
            self.train_mode)
        # pylint: enable=not-callable

        init_state_shape = initial_state.get_shape()

        # Broadcast the initial state to the whole batch if needed
        if len(init_state_shape) == 1:
            assert init_state_shape[0].value == self.rnn_size
            tiles = tf.tile(initial_state,
                            tf.expand_dims(self.batch_size, 0))
            initial_state = tf.reshape(tiles, [-1, self.rnn_size])

        return initial_state
def states(self) -> tf.Tensor:
    if self.hidden_dim is None:
        return self.concatenated_inputs

    states = tf.layers.dense(
        self.concatenated_inputs, self.hidden_dim, self.activation,
        name="hidden_layer")

    return dropout(states, self.dropout_keep_prob, self.train_mode)
def test_train_false(self):
    """Check that dropout is not applied when not in training mode."""
    var = tf.ones([10000])
    s = tf.Session()

    dropped_var = dropout(var, 0.1, tf.constant(False))
    dropped_size = tf.reduce_sum(dropped_var)

    dsize = s.run(dropped_size)
    self.assertTrue(dsize == 10000)
def embedded_inputs(self) -> tf.Tensor:
    with tf.variable_scope("input_projection"):
        embedding_matrix = get_variable(
            "word_embeddings",
            [len(self.vocabulary), self.embedding_size],
            initializer=tf.variance_scaling_initializer(
                mode="fan_avg", distribution="uniform"))

        return dropout(
            tf.nn.embedding_lookup(embedding_matrix, self.inputs),
            self.dropout_keep_prob,
            self.train_mode)
def attention(self,
              query: tf.Tensor,
              decoder_prev_state: tf.Tensor,
              decoder_input: tf.Tensor,
              loop_state: MultiHeadLoopState) -> Tuple[
                  tf.Tensor, MultiHeadLoopState]:
    """Run multi-head attention and get a context vector for a given query.

    This method is an API wrapper for the global function 'attention'
    defined in this module. It transforms a query of shape
    (batch, query_size) to shape (batch, 1, query_size) and applies the
    attention function. The output context has shape
    (batch, 1, value_size) and the weights have shape
    (batch, n_heads, 1, time(k)). The output is then processed to produce
    the output vector of contexts and the following attention loop state.

    Arguments:
        query: Input query for the current decoding step
            of shape (batch, query_size).
        decoder_prev_state: Previous state of the decoder.
        decoder_input: Input to the RNN cell of the decoder.
        loop_state: Attention loop state.

    Returns:
        Vector of contexts and the following attention loop state.
    """
    context_3d, weights_4d = attention(
        queries=tf.expand_dims(query, 1),
        keys=self.attention_keys,
        values=self.attention_values,
        keys_mask=self.attention_mask,
        num_heads=self.n_heads,
        dropout_callback=lambda x: dropout(
            x, self.dropout_keep_prob, self.train_mode))

    # head_weights_3d is a HEAD-wise list of (batch, 1, 1, time(keys))
    head_weights_3d = tf.split(weights_4d, self.n_heads, axis=1)

    context = tf.squeeze(context_3d, axis=1)
    head_weights = [tf.squeeze(w, axis=[1, 2]) for w in head_weights_3d]

    next_contexts = tf.concat(
        [loop_state.contexts, tf.expand_dims(context, 0)], axis=0)
    next_head_weights = [
        tf.concat([loop_state.head_weights[i],
                   tf.expand_dims(head_weights[i], 0)], axis=0)
        for i in range(self.n_heads)]

    next_loop_state = MultiHeadLoopState(
        contexts=next_contexts,
        head_weights=next_head_weights)

    return context, next_loop_state
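# The docstring above describes several shape transformations. The following
# standalone sketch (made-up sizes, assuming TF 1.x graph mode; it is not part
# of the decoder) illustrates how a (batch, n_heads, 1, time) weight tensor
# returned by the attention call is split and squeezed into per-head
# (batch, time) matrices, mirroring the tf.split / tf.squeeze calls above.
import tensorflow as tf

batch, n_heads, time_k = 2, 4, 7
weights_4d = tf.random_uniform([batch, n_heads, 1, time_k])

# Head-wise list of (batch, 1, 1, time) tensors
head_weights_3d = tf.split(weights_4d, n_heads, axis=1)

# Drop the singleton head and query dimensions -> (batch, time) per head
head_weights = [tf.squeeze(w, axis=[1, 2]) for w in head_weights_3d]

with tf.Session() as sess:
    print([w.shape for w in sess.run(head_weights)])  # 4 x (2, 7)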
def encoder_inputs(self) -> tf.Tensor:
    inputs = self.input_sequence.temporal_states

    if self.target_space_id is not None:
        inputs += tf.reshape(self.target_modality_embedding, [1, 1, -1])

    length = tf.shape(inputs)[1]

    if self.use_positional_encoding:
        inputs += position_signal(self.model_dimension, length)

    return dropout(inputs, self.dropout_keep_prob, self.train_mode)
def func(train_mode: tf.Tensor,
         rnn_size: int,
         encoders: List[Stateful]) -> tf.Tensor:
    if rnn_size is None:
        raise ValueError(
            "You must supply rnn_size for this type of encoder projection")

    en_concat = concat_encoder_projection(train_mode, None, encoders)

    return dropout(
        tf.layers.dense(en_concat, rnn_size, name="encoders_projection"),
        dropout_keep_prob, train_mode)
def feedforward_sublayer(self, layer_input: tf.Tensor) -> tf.Tensor:
    """Create the feed-forward network sublayer."""
    # Layer normalization
    normalized_input = layer_norm(layer_input)

    # Feed-forward network hidden layer + ReLU
    ff_hidden = tf.layers.dense(
        normalized_input, self.ff_hidden_size, activation=tf.nn.relu,
        name="hidden_state")

    # Apply dropout on the activations
    ff_hidden = dropout(ff_hidden, self.dropout_keep_prob, self.train_mode)

    # Feed-forward output projection
    ff_output = tf.layers.dense(ff_hidden, self.dimension, name="output")

    # Apply dropout on the output projection
    ff_output = dropout(ff_output, self.dropout_keep_prob, self.train_mode)

    # Add residual connections
    return ff_output + layer_input
def rnn(self) -> Tuple[tf.Tensor, tf.Tensor]:
    layer_input = self.rnn_input  # type: tf.Tensor
    # pylint: disable=unsubscriptable-object
    layer_final = self.rnn_input[:, -1]
    # pylint: enable=unsubscriptable-object

    for i, rnn_spec in enumerate(self.rnn_specs):
        with tf.variable_scope("rnn_{}_{}".format(i, rnn_spec.direction),
                               reuse=tf.AUTO_REUSE):
            if self.add_layer_norm:
                layer_input = layer_norm(layer_input)

            layer_output, layer_final_output = rnn_layer(
                layer_input, self.input_sequence.lengths, rnn_spec)

            layer_output = dropout(
                layer_output, self.dropout_keep_prob, self.train_mode)
            layer_final_output = dropout(
                layer_final_output, self.dropout_keep_prob,
                self.train_mode)

            in_dim = layer_input.get_shape()[-1]
            out_dim = layer_output.get_shape()[-1]

            if self.add_residual and in_dim == out_dim:
                layer_input += layer_output
                layer_final += layer_final_output
            else:
                # pylint: disable=redefined-variable-type
                layer_input = layer_output
                layer_final = layer_final_output
                # pylint: enable=redefined-variable-type

    assert layer_final is not None

    if self.include_final_layer_norm:
        return layer_norm(layer_input), layer_norm(layer_final)

    return layer_input, layer_final
def input_plus_attention(self, *args) -> tf.Tensor:
    """Merge input and previous attentions.

    Input and previous attentions are merged into a single vector
    of the size of the embedding.
    """
    loop_state = LoopState(*args)
    feedables = loop_state.feedables

    emb_with_ctx = tf.concat(
        [feedables.embedded_input] + feedables.prev_contexts, 1)

    return dropout(
        tf.layers.dense(emb_with_ctx, self.embedding_size),
        self.dropout_keep_prob, self.train_mode)
def test_keep_prob(self):
    """Count dropped items and compare the count with the expectation."""
    var = tf.ones([10000])
    s = tf.Session()

    for kprob in [0.1, 0.7]:
        dropped_var = dropout(var, kprob, tf.constant(True))
        dropped_size = tf.reduce_sum(
            tf.to_int32(tf.equal(dropped_var, 0.0)))

        dsize = s.run(dropped_size)
        expected_dropped_size = 10000 * (1 - kprob)

        self.assertTrue(np.isclose(expected_dropped_size, dsize, atol=500))
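# The three dropout tests above (invalid keep probability, no-op outside
# training mode, expected drop rate) pin down the behaviour of the `dropout`
# helper used throughout these model parts. Below is a minimal sketch that is
# consistent with those tests; it assumes TF 1.x (`tf.nn.dropout`, `tf.cond`)
# and is not necessarily the repository's exact implementation.
def dropout_sketch(variable: tf.Tensor,
                   keep_prob: float,
                   train_mode: tf.Tensor) -> tf.Tensor:
    """Perform dropout on a variable, depending on the training mode.

    A keep probability of 1.0 is a no-op; values outside (0, 1] are
    rejected, matching test_invalid_keep_prob.
    """
    if keep_prob <= 0.0 or keep_prob > 1.0:
        raise ValueError(
            "keep_prob must be in (0, 1], got {}".format(keep_prob))
    if keep_prob == 1.0:
        return variable

    # Apply dropout only when in training mode (cf. test_train_false)
    dropped_value = tf.nn.dropout(variable, keep_prob)
    return tf.cond(train_mode, lambda: dropped_value, lambda: variable)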
def embed_input_symbol(self, inputs: tf.Tensor) -> tf.Tensor:
    embedded = tf.nn.embedding_lookup(self.embedding_matrix, inputs)

    if (self.embeddings_source is not None
            and self.embeddings_source.scale_embeddings_by_depth):
        # Pylint @property-related bug
        # pylint: disable=no-member
        embedding_size = self.embedding_matrix.shape.as_list()[-1]
        # pylint: enable=no-member
        embedded *= math.sqrt(embedding_size)

    length = tf.shape(inputs)[1]

    return dropout(embedded + position_signal(self.dimension, length),
                   self.dropout_keep_prob, self.train_mode)
def _projection(prev_state, prev_output, ctx_tensors, train_mode):
    ctx_concat = tf.concat(ctx_tensors, 1)

    logit_rnn = tf.layers.dense(
        prev_state, output_size,
        kernel_initializer=get_initializer("rnn_state/kernel", None),
        name="rnn_state")

    logit_emb = tf.layers.dense(
        prev_output, output_size,
        kernel_initializer=get_initializer("prev_out/kernel", None),
        name="prev_out")

    logit_ctx = tf.layers.dense(
        ctx_concat, output_size,
        kernel_initializer=get_initializer("context/kernel", None),
        name="context")

    return dropout(activation_fn(logit_rnn + logit_emb + logit_ctx),
                   dropout_keep_prob, train_mode)
def logits(self) -> tf.Tensor:
    embeddings = self.embedded_sequence.embedding_matrix
    if not self.train_embeddings:
        embeddings = tf.stop_gradient(embeddings)

    states = self.states

    # pylint: disable=no-member
    states_dim = self.states.get_shape()[-1].value
    # pylint: enable=no-member
    embedding_dim = self.embedded_sequence.embedding_sizes[0]

    # pylint: disable=redefined-variable-type
    if states_dim != embedding_dim:
        states = tf.layers.dense(
            states, embedding_dim, name="project_for_embeddings")
        states = dropout(states, self.dropout_keep_prob, self.train_mode)
    # pylint: enable=redefined-variable-type

    reshaped_states = tf.reshape(states, [-1, embedding_dim])

    reshaped_logits = tf.matmul(
        reshaped_states, embeddings, transpose_b=True, name="logits")

    return tf.reshape(
        reshaped_logits, [self.batch_size, -1, len(self.vocabulary)])
def callback(x: tf.Tensor) -> tf.Tensor:
    return dropout(x, prob, self.train_mode)
def _attention_tensor(self) -> tf.Tensor:
    return dropout(self.states, self.dropout_keep_prob, self.train_mode)
def rnn_input(self) -> tf.Tensor:
    return dropout(self.input_sequence.temporal_states,
                   self.dropout_keep_prob, self.train_mode)
def _projection(prev_state, prev_output, ctx_tensors, train_mode):
    state_out_ctx = tf.concat([prev_state, prev_output] + ctx_tensors, 1)

    return dropout(
        tf.layers.dense(state_out_ctx, output_size,
                        activation=activation_fn),
        dropout_keep_prob, train_mode)
def _projection(prev_state, prev_output, ctx_tensors, train_mode):
    state_out_ctx = tf.concat([prev_state, prev_output] + ctx_tensors, 1)

    return dropout(
        maxout(state_out_ctx, maxout_size),
        dropout_keep_prob, train_mode)
def next_state(self, loop_state: LoopState) -> Tuple[tf.Tensor, Any, Any]:
    rnn_feedables = loop_state.feedables.other
    rnn_histories = loop_state.histories.other

    with tf.variable_scope(self.step_scope):
        rnn_input = self.input_projection(*loop_state)

        cell = self._get_rnn_cell()
        if self._rnn_cell_str in ["GRU", "NematusGRU"]:
            cell_output, next_state = cell(
                rnn_input, rnn_feedables.prev_rnn_output)

            attns = [
                a.attention(cell_output, rnn_feedables.prev_rnn_output,
                            rnn_input, att_loop_state)
                for a, att_loop_state in zip(
                    self.attentions,
                    rnn_histories.attention_histories)]
            if self.attentions:
                contexts, att_loop_states = zip(*attns)
            else:
                contexts, att_loop_states = [], []

            if self._conditional_gru:
                cell_cond = self._get_conditional_gru_cell()
                cond_input = tf.concat(contexts, -1)
                cell_output, next_state = cell_cond(
                    cond_input, next_state, scope="cond_gru_2_cell")

        elif self._rnn_cell_str == "LSTM":
            prev_state = tf.contrib.rnn.LSTMStateTuple(
                rnn_feedables.prev_rnn_state,
                rnn_feedables.prev_rnn_output)
            cell_output, state = cell(rnn_input, prev_state)
            next_state = state.c

            attns = [
                a.attention(cell_output, rnn_feedables.prev_rnn_output,
                            rnn_input, att_loop_state)
                for a, att_loop_state in zip(
                    self.attentions,
                    rnn_histories.attention_histories)]
            if self.attentions:
                contexts, att_loop_states = zip(*attns)
            else:
                contexts, att_loop_states = [], []
        else:
            raise ValueError("Unknown RNN cell.")

        # TODO: attention functions should apply dropout on output
        # themselves before returning the tensors
        contexts = [dropout(ctx, self.dropout_keep_prob, self.train_mode)
                    for ctx in list(contexts)]

        cell_output = dropout(
            cell_output, self.dropout_keep_prob, self.train_mode)

        with tf.name_scope("rnn_output_projection"):
            if self.embedding_size != self.output_dimension:
                raise ValueError(
                    "The dimension ({}) of the output projection must be "
                    "the same as the dimension of the input embedding "
                    "({})".format(self.output_dimension,
                                  self.embedding_size))

            # pylint: disable=not-callable
            output = self.output_projection(
                cell_output, loop_state.feedables.embedded_input,
                list(contexts), self.train_mode)
            # pylint: enable=not-callable

    new_feedables = RNNFeedables(
        prev_rnn_state=next_state,
        prev_rnn_output=cell_output,
        prev_contexts=list(contexts))

    new_histories = RNNHistories(
        rnn_outputs=append_tensor(rnn_histories.rnn_outputs, cell_output),
        attention_histories=list(att_loop_states))

    return (output, new_feedables, new_histories)
def embed_input_symbols(self, input_symbols: tf.Tensor) -> tf.Tensor:
    embedded_input = tf.nn.embedding_lookup(
        self.embedding_matrix, input_symbols)
    return dropout(embedded_input, self.dropout_keep_prob, self.train_mode)
def _attention_states_dropped(self) -> tf.Tensor:
    return dropout(get_attention_states(self.input_sequence),
                   self.dropout_keep_prob, self.train_mode)
def callback(x: tf.Tensor) -> tf.Tensor:
    return dropout(x, prob, self.train_mode)
def attention_states(self) -> tf.Tensor:
    return dropout(get_attention_states(self.encoder),
                   self.dropout_keep_prob, self.train_mode)
def _logit_function(self, state: tf.Tensor) -> tf.Tensor:
    state = dropout(state, self.dropout_keep_prob, self.train_mode)
    return tf.matmul(state, self.decoding_w) + self.decoding_b
def embed_input_symbol(self, *args) -> tf.Tensor:
    loop_state = LoopState(*args)
    embedded_input = tf.nn.embedding_lookup(
        self.embedding_matrix, loop_state.feedables.input_symbol)
    return dropout(embedded_input, self.dropout_keep_prob, self.train_mode)
def rnn_input(self) -> tf.Tensor:
    return dropout(self.input_sequence.temporal_states,
                   self.dropout_keep_prob, self.train_mode)
def __init__(self,
             name: str,
             vocabulary: Vocabulary,
             data_id: str,
             embedding_size: int,
             filters: List[Tuple[int, int]],
             max_input_len: Optional[int] = None,
             dropout_keep_prob: float = 1.0,
             save_checkpoint: Optional[str] = None,
             load_checkpoint: Optional[str] = None) -> None:
    """Create a new instance of the CNN sequence encoder.

    Based on: Yoon Kim: Convolutional Neural Networks for Sentence
    Classification (http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf)

    Arguments:
        vocabulary: Input vocabulary
        data_id: Identifier of the data series fed to this encoder
        name: A unique identifier for this encoder
        max_input_len: Maximum length of an encoded sequence
        embedding_size: The size of the embedding vector assigned
            to each word
        filters: Specification of CNN filters. It is a list of tuples
            specifying the filter size and number of channels.
        dropout_keep_prob: The dropout keep probability (default 1.0)
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
    assert check_argument_types()

    self.vocabulary = vocabulary
    self.data_id = data_id
    self.max_input_len = max_input_len

    with tf.variable_scope(self.name):
        self.train_mode = tf.placeholder(tf.bool, shape=[],
                                         name="mode_placeholder")

        self.inputs = tf.placeholder(tf.int32, shape=[None, None],
                                     name="encoder_input")

        self._input_mask = tf.placeholder(tf.float32, shape=[None, None],
                                          name="encoder_padding")

        with tf.variable_scope("input_projection"):
            self.embedding_matrix = tf.get_variable(
                "word_embeddings", [len(vocabulary), embedding_size],
                initializer=tf.random_normal_initializer(stddev=0.01))

            embedded_inputs = dropout(
                tf.nn.embedding_lookup(self.embedding_matrix, self.inputs),
                dropout_keep_prob, self.train_mode)

        pooled_outputs = []
        for filter_size, num_filters in filters:
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, num_filters]
                w_filter = tf.get_variable(
                    "conv_W", filter_shape,
                    initializer=tf.random_uniform_initializer(-0.5, 0.5))
                b_filter = tf.get_variable(
                    "conv_bias", [num_filters],
                    initializer=tf.constant_initializer(0.0))
                conv = tf.nn.conv1d(embedded_inputs, w_filter, stride=1,
                                    padding="VALID", name="conv")

                # Apply nonlinearity
                conv_relu = tf.nn.relu(tf.nn.bias_add(conv, b_filter))

                # Max-pooling over the outputs
                pooled = tf.reduce_max(conv_relu, 1)
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        self.encoded = tf.concat(pooled_outputs, axis=1)
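# Standalone illustration of the Kim (2014) convolution + max-over-time
# pooling pattern used in the constructor above. All sizes here are made up
# and the snippet assumes TF 1.x; it is not the encoder's exact graph.
import tensorflow as tf

batch, time, emb = 3, 20, 50
filter_size, num_filters = 3, 100    # one (filter_size, channels) pair

embedded = tf.random_normal([batch, time, emb])

w = tf.get_variable("illustration_conv_W", [filter_size, emb, num_filters])
b = tf.get_variable("illustration_conv_b", [num_filters],
                    initializer=tf.zeros_initializer())

conv = tf.nn.conv1d(embedded, w, stride=1, padding="VALID")
conv_relu = tf.nn.relu(tf.nn.bias_add(conv, b))  # (batch, time - 2, filters)
pooled = tf.reduce_max(conv_relu, 1)             # (batch, num_filters)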
def __init__(self,
             name: str,
             data_id: str,
             rnn_size: int,
             input_dimension: int,
             max_input_len: Optional[int] = None,
             dropout_keep_prob: float = 1.0,
             attention_type: Optional[Any] = None,
             save_checkpoint: Optional[str] = None,
             load_checkpoint: Optional[str] = None) -> None:
    """Create a new instance of the encoder.

    Arguments:
        data_id: Identifier of the data series fed to this encoder
        name: A unique identifier for this encoder
        rnn_size: The size of the encoder's hidden state. Note that
            the actual encoder output state size will be twice this
            size because it is the concatenation of the forward and
            backward hidden states.

    Keyword arguments:
        dropout_keep_prob: The dropout keep probability (default 1.0)
        attention_type: The class that is used for creating the
            attention mechanism (default None)
    """
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)
    Attentive.__init__(self, attention_type)
    assert check_argument_types()

    self.data_id = data_id

    self.rnn_size = rnn_size
    self.max_input_len = max_input_len
    self.input_dimension = input_dimension
    self.dropout_keep_p = dropout_keep_prob

    log("Initializing RNN encoder, name: '{}'".format(self.name))

    with tf.variable_scope(self.name):
        self._create_input_placeholders()

        self._input_mask = tf.sequence_mask(self._input_lengths,
                                            dtype=tf.float32)

        fw_cell, bw_cell = self.rnn_cells()  # type: RNNCellTuple
        outputs_bidi_tup, encoded_tup = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, self.inputs, self._input_lengths,
            dtype=tf.float32)

        self.hidden_states = tf.concat(outputs_bidi_tup, 2)

        with tf.variable_scope("attention_tensor"):
            self.__attention_tensor = dropout(
                self.hidden_states, self.dropout_keep_p, self.train_mode)

        self.encoded = tf.concat(encoded_tup, 1)

    log("RNN encoder initialized")
def _attention_states_dropped(self) -> tf.Tensor:
    return dropout(get_attention_states(self.input_sequence),
                   self.dropout_keep_prob, self.train_mode)
def attention_states(self) -> tf.Tensor:
    return dropout(get_attention_states(self.encoder),
                   self.dropout_keep_prob, self.train_mode)
def __init__(self,
             name: str,
             data_id: str,
             input_size: int,
             rnn_layers: List[RNNSpecTuple],
             max_input_len: Optional[int] = None,
             dropout_keep_prob: float = 1.0,
             save_checkpoint: Optional[str] = None,
             load_checkpoint: Optional[str] = None) -> None:
    """Create a new instance of the encoder.

    Arguments:
        data_id: Identifier of the data series fed to this encoder
        name: A unique identifier for this encoder
        rnn_layers: A list of tuples specifying the size and, optionally,
            the direction ('forward', 'backward' or 'bidirectional')
            and cell type ('GRU' or 'LSTM') of each RNN layer.

    Keyword arguments:
        dropout_keep_prob: The dropout keep probability (default 1.0)
    """
    check_argument_types()
    ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)

    self.data_id = data_id

    self._rnn_layers = [_make_rnn_spec(*r) for r in rnn_layers]
    self.max_input_len = max_input_len
    self.input_size = input_size
    self.dropout_keep_prob = dropout_keep_prob

    log("Initializing RNN encoder, name: '{}'".format(self.name))

    with self.use_scope():
        self._create_input_placeholders()

        self.states_mask = tf.sequence_mask(self._input_lengths,
                                            dtype=tf.float32)

        states = self.inputs
        states_reversed = False

        def reverse_states():
            nonlocal states, states_reversed
            states = tf.reverse_sequence(
                states, self._input_lengths, batch_dim=0, seq_dim=1)
            states_reversed = not states_reversed

        for i, layer in enumerate(self._rnn_layers):
            with tf.variable_scope("rnn_{}_{}".format(i, layer.direction)):
                cell = _make_rnn_cell(layer)

                if layer.direction == "bidirectional":
                    outputs_tup, encoded_tup = (
                        tf.nn.bidirectional_dynamic_rnn(
                            cell(), cell(), states, self._input_lengths,
                            dtype=tf.float32))

                    if states_reversed:
                        # treat forward as backward and vice versa
                        outputs_tup = tuple(reversed(outputs_tup))
                        encoded_tup = tuple(reversed(encoded_tup))
                        states_reversed = False

                    states = tf.concat(outputs_tup, 2)
                    encoded = tf.concat(encoded_tup, 1)

                elif layer.direction in ["forward", "backward"]:
                    should_be_reversed = (layer.direction == "backward")
                    if states_reversed != should_be_reversed:
                        reverse_states()

                    states, encoded = tf.nn.dynamic_rnn(
                        cell(), states,
                        sequence_length=self._input_lengths,
                        dtype=tf.float32)
                else:
                    raise ValueError("Unknown RNN direction {}".format(
                        layer.direction))

            if i < len(self._rnn_layers) - 1:
                states = dropout(states, self.dropout_keep_prob,
                                 self.train_mode)

        if states_reversed:
            reverse_states()

        self.hidden_states = states
        self.encoded = encoded

    log("RNN encoder initialized")
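# Hypothetical `rnn_layers` specification for the constructor above, written
# out the way its docstring describes each tuple: a size plus an optional
# direction and cell type. The tuple ordering (size, direction, cell_type) is
# an assumption based on that description, not a confirmed part of the
# `_make_rnn_spec` signature.
example_rnn_layers = [
    (300, "bidirectional", "GRU"),   # first layer: bidirectional GRU
    (300, "forward", "LSTM"),        # second layer: unidirectional LSTM
    (300,),                          # last layer: library defaults
]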