def __call__(self, inputs, lengths=None, is_training=None):
    """
    :param inputs: [B, T, input_dim]
    :param lengths: [B]
    :return: [B, T, output_units]
    """
    with tf.variable_scope(self.name, reuse=self.reuse):
        h = inputs
        for i, layer in enumerate(self._hidden_layers):
            if type(layer) in [tuple, list]:
                h = dense(inputs=h, units=layer[0], activation=layer[1],
                          dropout=self._dropout, is_training=is_training)
            elif type(layer) is FeedForwardEncoder:
                h = layer(h, lengths=lengths, is_training=is_training)
            else:
                raise ValueError('Unknown type of layer: %s' % type(layer))
        outputs = dense(inputs=h, units=self._num_units,
                        activation=self._activation_fn,
                        dropout=self._dropout, is_training=is_training)
        return outputs
def attention_logits(
        inputs,   # [B, M, d]
        outputs,  # [B, N, d]
        units,
        dropout=None,
        is_training=None,
        name='attention-layer',
        reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        # [B, M, d]
        keys = dense(
            inputs=inputs,  # [B, M, dx]
            units=units,
            activation=None,
            name='keys',
            dropout=dropout,
            is_training=is_training)
        # [B, N, d]
        queries = dense(
            inputs=outputs,  # [B, N, dy]
            units=units,
            activation=None,
            name='queries',
            dropout=dropout,
            is_training=is_training)
        # prediction of Categorical parameters of P(A_j|x_1^m, y_<j) via dot product
        # [B, N, M]
        logits = tf.matmul(
            queries,  # [B, N, dh]
            keys,     # [B, M, dh]
            transpose_b=True)
        # [B, N, M]
        return logits
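# Usage sketch (not part of the original module): the [B, N, M] logits returned by
# `attention_logits` are typically masked over padded source positions and passed
# through a softmax before mixing the source states. The names below
# (`_attention_context_sketch`, `values`, `targets`, `source_lengths`) are
# hypothetical placeholders.
def _attention_context_sketch(values, targets, source_lengths, units=128):
    """values: [B, M, d], targets: [B, N, d'], source_lengths: [B] -> context [B, N, d]"""
    logits = attention_logits(inputs=values, outputs=targets, units=units)  # [B, N, M]
    longest = tf.shape(values)[1]
    # valid source positions, broadcast over target positions: [B, N, M]
    mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(source_lengths, maxlen=longest), 1),
        [1, tf.shape(targets)[1], 1])
    masked = tf.where(mask, logits, tf.fill(tf.shape(logits), float('-inf')))
    weights = tf.nn.softmax(masked)    # [B, N, M]
    return tf.matmul(weights, values)  # [B, N, d]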
def delta_tensor(shape, inputs, var_name, activation=tf.identity, initializer=None,
                 summary=False, dropout=0., is_training=None):
    """
    Returns a tensor filled with `dim` parameters or predictions.

    :param shape: [batch_size, max_time, dim]
    :param inputs: if not None, then we predict a tensor [batch_size, max_time, dim]
        otherwise we tile `dim` parameters to make a [batch_size, max_time, dim] tensor
    :param var_name: variable scope for the parameters/prediction
    :param activation: applied to the parameters/prediction (defaults to tf.identity)
    :param initializer: optional initializer for the `dim` parameters
    :param summary: if True, log a histogram summary
    :param dropout: dropout rate for the prediction layer
    :param is_training: training flag (required when dropout > 0)
    :return: [batch_size, max_time, dim]
    """
    dim = shape[-1]
    if inputs is None:
        logging.info(' %s = tensor(%s) with t a parameter',
                     var_name, pprint_activation(activation, 't'))
        # here we have `dim` parameters
        with tf.variable_scope(var_name):
            outputs = activation(
                tf.get_variable(
                    name='pre_t',
                    shape=dim if initializer is None else None,
                    dtype=tf.float32,
                    initializer=initializer))
            expanded_shape = [1] * len(shape)
            expanded_shape[-1] = dim  # e.g. [1, 1, dim]
            copies = list(shape)
            copies[-1] = 1  # e.g. [B, T, 1]
            outputs = tf.tile(tf.reshape(outputs, expanded_shape), copies)
            if summary:
                tf.summary.histogram('parameter', outputs)
    else:
        logging.info(' %s = tensor(%s) with t a prediction',
                     var_name, pprint_activation(activation, 't'))
        # here we predict a tensor with shape `shape`
        with tf.variable_scope(var_name):
            outputs = dense(inputs=inputs, units=dim, activation=activation,
                            dropout=dropout, is_training=is_training)
            if summary:
                tf.summary.histogram('prediction', outputs)
    return outputs
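# Example sketch (not part of the original module): the two modes of `delta_tensor`,
# a single learned vector tiled over batch and time versus a per-position prediction
# from hidden states. `_delta_tensor_sketch` and `h` are hypothetical names.
def _delta_tensor_sketch(h):
    """h: [B, T, d] hidden states -> two [B, T, 10] tensors."""
    shape = [tf.shape(h)[0], tf.shape(h)[1], 10]
    # 10 shared (softplus-positive) parameters broadcast to [B, T, 10]
    tiled = delta_tensor(shape, inputs=None, var_name='prior_scale',
                         activation=tf.nn.softplus)
    # a [B, T, 10] prediction computed from h
    predicted = delta_tensor(shape, inputs=h, var_name='posterior_scale',
                             activation=tf.nn.softplus)
    return tiled, predicted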
def __call__(self, inputs, lengths=None, is_training=None):
    """
    :param inputs: [B, T, input_dim]
    :param lengths: [B]
    :return: [B, T, output_units]
    """
    with tf.variable_scope(self.name, reuse=self.reuse):
        outputs = dense(inputs=inputs, units=self._num_units,
                        activation=self._activation_fn,
                        use_bias=self._use_bias,
                        dropout=self._dropout,
                        is_training=is_training)
        return outputs
def self_attention_layer(inputs, num_steps, units,
                         dropout=None, is_training=None,
                         activation=tf.nn.softmax,
                         mask_diagonal=False, mask_value=float('-inf'),
                         name='SelfAttention', reuse=None):
    """
    Compute self attention levels (masking invalid positions).

    :param inputs: [batch_size, max_time, dim]
    :param num_steps: number of steps per training instance [batch_size]
    :param units: number of query/key units
    :param activation: defaults to tf.nn.softmax for normalised attention
    :param mask_diagonal: defaults to False
    :param mask_value: defaults to -inf
    :param name: defaults to SelfAttention
    :param reuse: passed to tf layers (defaults to None)
    :return: [batch_size, max_time, max_time]
    """
    batch_size = tf.shape(inputs)[0]  # B
    longest = tf.shape(inputs)[1]     # M
    with tf.variable_scope(name):
        # [B, M, d]
        queries = dense(inputs, units=units, dropout=dropout, is_training=is_training,
                        name='queries', reuse=reuse)
        keys = dense(inputs, units=units, dropout=dropout, is_training=is_training,
                     name='keys', reuse=reuse)
        # [B, M, M]
        scores = tf.matmul(
            queries,  # [B, M, d]
            keys,     # [B, M, d]
            transpose_b=True)
        # mask invalid logits
        # [B, M, M]
        condition = tf.tile(
            # make the boolean mask [B, 1, M]
            tf.expand_dims(
                # get a boolean mask [B, M]
                tf.sequence_mask(num_steps, maxlen=longest),
                1),
            [1, longest, 1])
        scores = tf.where(
            # the boolean mask is [B, M, M]
            condition=condition,
            x=scores,
            y=tf.ones(shape=[batch_size, longest, longest]) * mask_value)
        # mask diagonal
        if mask_diagonal:
            scores += tf.diag(tf.fill([tf.shape(scores)[-1]], mask_value))
        # Normalise attention
        # [B, M, M]
        #outputs = tf.where(
        #    condition=condition,
        #    x=activation(scores),
        #    y=tf.zeros_like(scores)
        #)
        return activation(scores)
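# Usage sketch (not part of the original module): turn the normalised [B, M, M]
# weights returned by `self_attention_layer` into per-position context vectors by
# mixing the input states. `_self_attention_context_sketch`, `states` and `lengths`
# are hypothetical names.
def _self_attention_context_sketch(states, lengths, units=64):
    """states: [B, M, d], lengths: [B] -> contexts [B, M, d]."""
    weights = self_attention_layer(states, num_steps=lengths, units=units)  # [B, M, M]
    return tf.matmul(weights, states)  # [B, M, d]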
def build(self, inputs, lengths=None, is_training=None):
    """
    :param inputs: [B, T, input_dim]
    :param lengths: [B]
    :return: outputs [B, T, output_units], final forward/backward states (merged)
    """
    if self._cell_type == 'lstm':
        cell_class = tf.contrib.rnn.BasicLSTMCell
    elif self._cell_type == 'gru':
        cell_class = tf.contrib.rnn.GRUCell
    else:
        raise ValueError('Unknown cell_type=%s' % self._cell_type)
    with tf.variable_scope(self.name, reuse=self.reuse):
        # decide how many units per cell
        if self._merge_strategy == 'half':
            num_units = self._num_units // 2
        else:
            num_units = self._num_units
        cell_fw = cell_class(num_units=num_units)
        cell_bw = cell_class(num_units=num_units)
        if self._dropout > 0.:
            if is_training is None:
                raise ValueError('With dropout I require a training flag')
            keep_prob = 1. - self._dropout
            cell_fw = tf.contrib.rnn.DropoutWrapper(
                cell_fw,
                output_keep_prob=tf.where(is_training, keep_prob, 1.),
                state_keep_prob=tf.where(is_training, keep_prob, 1.),
                variational_recurrent=self._variational_recurrent,
                dtype=inputs.dtype,
                input_size=inputs.shape[-1:]  # input depth (only consulted when input_keep_prob < 1)
            )
            cell_bw = tf.contrib.rnn.DropoutWrapper(
                cell_bw,
                output_keep_prob=tf.where(is_training, keep_prob, 1.),
                state_keep_prob=tf.where(is_training, keep_prob, 1.),
                variational_recurrent=self._variational_recurrent,
                dtype=inputs.dtype,
                input_size=inputs.shape[-1:]  # input depth (only consulted when input_keep_prob < 1)
            )
        with tf.variable_scope(
                dynamic_rnn_scope_name(self._dropout, self._variational_recurrent)):
            (outputs_fw, outputs_bw), (states_fw, states_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=inputs,
                sequence_length=lengths,
                dtype=tf.float32)
        if self._merge_strategy == 'sum':
            # [B, T, num_units]
            outputs = tf.add(outputs_fw, outputs_bw)
            states = tf.add(states_fw, states_bw)
        else:
            # [B, T, num_units * 2]
            outputs = tf.concat([outputs_fw, outputs_bw], -1)
            states = tf.concat([states_fw, states_bw], -1)
            if self._merge_strategy == 'project':
                # num_units * 2 => output_units
                outputs = dense(inputs=outputs, units=self.output_units, activation=None,
                                use_bias=False, dropout=self._ff_dropout,
                                is_training=is_training)
        if self._residual:
            outputs += inputs
        return outputs, states
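# Note on the merge strategies above (dimensionality sketch, assuming for example
# num_units=256): 'half' runs each direction with 128 units so the concatenation is
# 256-dimensional; 'sum' keeps 256 units per direction and adds the two directions
# elementwise; any other strategy concatenates to 512; 'project' additionally maps
# the 512-dimensional concatenation back to `output_units` with a bias-free dense
# layer. With `residual=True` the merged outputs must match the input dimensionality
# for the addition `outputs += inputs` to be valid.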
def __call__(self, inputs, lengths=None, is_training=None):
    """
    :param inputs: [B, T, input_units]
    :param lengths: [B]
    :return: outputs [B, T, output_units=input_units] (same shape as inputs)
    """
    if lengths is None:
        raise ValueError('I need lengths to figure out what is the past')
    memory = self.padleft(inputs)
    lengthsp1 = lengths + 1
    batch_size = tf.shape(memory)[0]
    longestp1 = tf.shape(memory)[1]
    with tf.variable_scope(self.name, reuse=self.reuse):
        # [B, T+1, d]
        queries = dense(inputs=memory, units=self.output_units, use_bias=False,
                        dropout=self._dropout, is_training=is_training,
                        name='queries', reuse=self.reuse)
        keys = memory
        # [B, T+1, T+1]
        scores = tf.matmul(
            queries,  # [B, T+1, d]
            keys,     # [B, T+1, d]
            transpose_b=True)
        # mask invalid logits
        inf = tf.fill([batch_size, longestp1, longestp1], float('-inf'))
        scores = tf.where(
            # make the boolean mask [B, T+1, T+1]
            condition=tf.tile(
                # make the boolean mask [B, 1, T+1]
                tf.expand_dims(
                    tf.sequence_mask(lengthsp1, maxlen=longestp1),
                    1),
                [1, longestp1, 1]),
            x=scores,
            y=inf)
        # keep only the (left-padded) past: the lower triangle, diagonal included
        # [B, T+1, T+1]
        past_mask = tf.cast(
            fill_triangular(
                tf.ones([batch_size, longestp1 * (longestp1 + 1) // 2])
            ),  # lower triangular (including diag)
            dtype=tf.bool)
        scores = tf.where(
            condition=past_mask,
            x=scores,
            y=inf)
        # [B, T+1, T+1]
        adjacencies = tf.nn.softmax(scores)
        outputs = tf.matmul(
            adjacencies,  # [B, T+1, T+1]
            memory        # [B, T+1, d]
        )
        outputs = self.trimright(outputs)
        return outputs
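# Note on the alignment above (assuming `padleft` prepends a single start-of-sequence
# step and `trimright` drops the last step): memory position j holds input j-1, and the
# lower-triangular mask lets query position j attend to memory positions 0..j only.
# After trimright, output position t is therefore a mixture of the padding vector and
# the inputs at positions strictly before t, i.e. a representation of the prefix x_{<t}
# that can be combined with the input at position t without peeking at it.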