def call(self, x, mask=None): eij = K.dot(x, self.W) if self.use_bias: eij = K.bias_add(eij, self.bias) if self.activation == 'tanh': eij = K.tanh(eij) elif self.activation == 'relu': eij = K.relu(eij) else: eij = eij ai = K.exp(eij) weights = ai / K.sum(ai, axis=1, keepdims=True) weighted_input = x * weights return K.sum(weighted_input, axis=1)
def _time_distributed_dense(x, w, b=None, dropout=None, input_dim=None, output_dim=None, timesteps=None, training=None): """Apply `y . w + b` for every temporal slice y of x. # Arguments x: input tensor. w: weight matrix. b: optional bias vector. dropout: wether to apply dropout (same dropout mask for every temporal slice of the input). input_dim: integer; optional dimensionality of the input. output_dim: integer; optional dimensionality of the output. timesteps: integer; optional number of timesteps. training: training phase tensor or boolean. # Returns Output tensor. """ if not input_dim: input_dim = K.shape(x)[2] if not timesteps: timesteps = K.shape(x)[1] if not output_dim: output_dim = K.int_shape(w)[1] if dropout is not None and 0. < dropout < 1.: # apply the same dropout pattern at every timestep ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim))) dropout_matrix = K.dropout(ones, dropout) expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps) x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training) # collapse time dimension and batch dimension together x = K.reshape(x, (-1, input_dim)) x = K.dot(x, w) if b is not None: x = K.bias_add(x, b) # reshape to 3D tensor if K.backend() == 'tensorflow': x = K.reshape(x, K.stack([-1, timesteps, output_dim])) x.set_shape([None, None, output_dim]) else: x = K.reshape(x, (-1, timesteps, output_dim)) return x
def call(self, x, mask=None): # size of x :[batch_size, sel_len, attention_dim] # size of u :[batch_size, attention_dim] # uit = tanh(xW+b) uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1)) uit = tf.matmul(x, uit) uit = K.tanh(K.bias_add(uit, self.b)) ait = K.dot(uit, self.u) ait = K.squeeze(ait, -1) ait = K.exp(ait) if mask is not None: # Cast the mask to floatX to avoid float64 upcasting in theano ait *= K.cast(mask, K.floatx()) ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx()) ait = K.expand_dims(ait) weighted_input = x * ait output = K.sum(weighted_input, axis=1) return output
def step(self, inputs, states): h_tm1 = states[0] c_tm1 = states[1] dp_mask = states[2] rec_dp_mask = states[3] x_input = states[4] # alignment model h_att = K.repeat(h_tm1, self.timestep_dim) att = _time_distributed_dense(x_input, self.attention_weights, self.attention_bias, output_dim=K.int_shape( self.attention_weights)[1]) attention_ = self.attention_activation( K.dot(h_att, self.attention_recurrent_weights) + att) attention_ = K.squeeze( K.dot(attention_, self.attention_recurrent_bias), 2) alpha = K.exp(attention_) if dp_mask is not None: alpha *= dp_mask[0] alpha /= K.sum(alpha, axis=1, keepdims=True) alpha_r = K.repeat(alpha, self.input_dim) alpha_r = K.permute_dimensions(alpha_r, (0, 2, 1)) # make context vector (soft attention after Bahdanau et al.) z_hat = x_input * alpha_r context_sequence = z_hat z_hat = K.sum(z_hat, axis=1) if self.implementation == 2: z = K.dot(inputs * dp_mask[0], self.kernel) z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel) z += K.dot(z_hat, self.attention_kernel) if self.use_bias: z = K.bias_add(z, self.bias) z0 = z[:, :self.units] z1 = z[:, self.units:2 * self.units] z2 = z[:, 2 * self.units:3 * self.units] z3 = z[:, 3 * self.units:] i = self.recurrent_activation(z0) f = self.recurrent_activation(z1) c = f * c_tm1 + i * self.activation(z2) o = self.recurrent_activation(z3) else: if self.implementation == 0: x_i = inputs[:, :self.units] x_f = inputs[:, self.units:2 * self.units] x_c = inputs[:, 2 * self.units:3 * self.units] x_o = inputs[:, 3 * self.units:] elif self.implementation == 1: x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o else: raise ValueError('Unknown `implementation` mode.') i = self.recurrent_activation( x_i + K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i) + K.dot(z_hat, self.attention_i)) f = self.recurrent_activation( x_f + K.dot(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f) + K.dot(z_hat, self.attention_f)) c = f * c_tm1 + i * self.activation( x_c + K.dot(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c) + K.dot(z_hat, self.attention_c)) o = self.recurrent_activation( x_o + K.dot(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o) + K.dot(z_hat, self.attention_o)) h = o * self.activation(c) if 0 < self.dropout + self.recurrent_dropout: h._uses_learning_phase = True if self.return_attention: return context_sequence, [h, c] else: return h, [h, c]