class BERTPooler(Layer):
    """Pool a sequence of hidden states through the first token.

    ``call`` slices position 0 of the sequence axis and feeds it to a frozen
    ``tanh`` dense layer with ``config.hidden_size`` units.
    """

    def __init__(self, config, **kwargs):
        """Create the pooler.

        Args:
            config: Object exposing ``hidden_size``.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        # Setting ``self.trainable`` before ``Layer.__init__`` has no lasting
        # effect because the base constructor assigns the flag itself; pass
        # it as a keyword so the layer is frozen unless the caller overrides.
        kwargs.setdefault('trainable', False)
        super().__init__(**kwargs)
        self.config = config
        self.dense = Dense(input_shape=[config.hidden_size, ],
                           units=config.hidden_size,
                           trainable=False,
                           activation='tanh')

    def build(self, input_shape):
        """Build the inner dense layer and mark this layer as built.

        Args:
            input_shape: Shape handed over by the caller. It is only passed
                on to ``Layer.build``.
        """
        # ``Dense.build`` only reads the last entry of the shape, and the
        # projection always consumes vectors of ``hidden_size`` features, so
        # size the kernel from the config rather than from the (int, 2-D or
        # 3-D) form ``input_shape`` happens to have.
        self.dense.build((None, self.config.hidden_size))
        super(BERTPooler, self).build(input_shape)

    def call(self, hidden_states, **kwargs):
        """Return ``[pooled_output]`` for ``hidden_states``.

        Args:
            hidden_states: Tensor indexed as ``[:, 0, :]``, i.e. rank 3.

        Returns:
            A one-element list holding the dense projection of the first
            token's hidden state.
        """
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0, :]
        pooled_output = self.dense(first_token_tensor)
        return [pooled_output]
def wider_next_dense(layer, start_dim, total_dim, n_add):
    """Build the dense layer that follows a widened layer.

    The kernel of ``layer`` is viewed as ``total_dim`` equally sized groups of
    rows. Rows of zeros for ``n_add`` extra groups are inserted in front of
    group ``start_dim``; the bias and activation are copied unchanged.

    Args:
        layer: Dense layer providing the original weights and activation.
        start_dim: Group index at which the zero rows are inserted.
        total_dim: Number of groups the kernel rows are divided into.
        n_add: Number of groups to add.

    Returns:
        A new, built ``Dense`` layer with ``layer.units`` units.
    """
    unit_count = layer.units
    kernel, bias = layer.get_weights()

    rows_per_group = int(kernel.shape[0] / total_dim)
    insert_at = start_dim * rows_per_group
    last_row = total_dim * rows_per_group

    zero_rows = np.zeros((n_add * rows_per_group, kernel.shape[1]))
    widened_kernel = np.concatenate(
        (kernel[:insert_at], zero_rows, kernel[insert_at:last_row]))

    successor = Dense(unit_count,
                      activation=layer.get_config()['activation'])
    successor.build((None, widened_kernel.shape[0]))
    successor.set_weights((widened_kernel, bias))
    return successor
class BERTIntermediate(Layer):
    """Frozen dense expansion to ``intermediate_size`` followed by ``gelu``."""

    def __init__(self, config, **kwargs):
        """Create the layer.

        Args:
            config: Object exposing ``hidden_size`` and ``intermediate_size``.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        # An attribute assignment made before ``Layer.__init__`` is replaced
        # by the base constructor's own value, so request the frozen state
        # through the keyword argument instead.
        kwargs.setdefault('trainable', False)
        super().__init__(**kwargs)
        self.config = config
        self.dense = Dense(input_shape=(self.config.hidden_size, ),
                           units=self.config.intermediate_size,
                           trainable=False)
        self.intermediate_act_fn = gelu

    def build(self, input_shape):
        """Build the dense layer for ``hidden_size`` input features."""
        # Only the last entry matters to ``Dense.build``.
        self.dense.build(
            (self.config.intermediate_size, self.config.hidden_size))
        super(BERTIntermediate, self).build(input_shape)

    def call(self, x, **kwargs):
        """Apply the dense layer position-wise, then the activation.

        Args:
            x: Tensor whose axis 1 is restored after the dense projection
                (assumed rank 3 -- confirm against the caller).

        Returns:
            The activated tensor, reshaped to ``(-1, x.shape[1], units)``.
        """
        hidden_states = x
        original_shape = hidden_states.shape
        # Flatten every position into a row so the dense layer sees 2-D input.
        hidden_states_r = K.reshape(hidden_states,
                                    (-1, hidden_states.shape[-1]))
        hidden_states = self.dense(hidden_states_r)
        # Restore the sequence axis with the new feature width.
        hidden_states_r = K.reshape(
            hidden_states, (-1, original_shape[1], hidden_states.shape[-1]))
        hidden_states = self.intermediate_act_fn(hidden_states_r)
        return hidden_states
class BERTOutput(Layer):
    """Project back to ``hidden_size``, apply dropout, add residual, normalise."""

    def __init__(self, config, **kwargs):
        """Create the layer.

        Args:
            config: Object exposing ``hidden_size``, ``intermediate_size`` and
                ``hidden_dropout_prob``.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        # Pass the frozen state as a keyword: an attribute set before
        # ``Layer.__init__`` is replaced by the base constructor.
        kwargs.setdefault('trainable', False)
        super().__init__(**kwargs)
        self.config = config
        self.dense = Dense(input_shape=(config.intermediate_size, ),
                           units=config.hidden_size,
                           trainable=False)
        self.LayerNorm = BERTLayerNorm(config, trainable=False)
        self.dropout = Dropout(config.hidden_dropout_prob, trainable=False)

    def build(self, input_shape):
        """Build the sub-layers and mark this layer as built."""
        # Only the last entry matters to ``Dense.build``.
        self.dense.build(
            (self.config.hidden_size, self.config.intermediate_size))
        self.LayerNorm.build(self.config.hidden_size)
        self.dropout.build(self.config.hidden_size)
        # Hand the real input shape to the base class rather than a bare int.
        super(BERTOutput, self).build(input_shape)

    def call(self, x, **kwargs):
        """Compute ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``.

        Args:
            x: Pair ``(input_tensor, hidden_states)``.

        Returns:
            The normalised tensor.
        """
        input_tensor, hidden_states = x
        original_shape = hidden_states.shape
        # Flatten positions into rows for the dense projection.
        hidden_states_r = K.reshape(hidden_states,
                                    (-1, hidden_states.shape[-1]))
        hidden_states = self.dense(hidden_states_r)
        hidden_states = self.dropout(hidden_states)
        # Restore the sequence axis before the residual connection.
        hidden_states_r = K.reshape(
            hidden_states, (-1, original_shape[1], hidden_states.shape[-1]))
        hidden_states = self.LayerNorm(hidden_states_r + input_tensor)
        return hidden_states
def wider_pre_dense(layer, n_add):
    """Build a wider copy of a dense layer by duplicating random units.

    ``n_add`` existing output units are drawn at random (with replacement)
    and their kernel columns and bias entries are appended to the originals.

    Args:
        layer: Dense layer to widen.
        n_add: Number of output units to add.

    Returns:
        A new, built ``Dense`` layer with ``layer.units + n_add`` units.
    """
    teacher_w, teacher_b = layer.get_weights()
    n_inputs = teacher_w.shape[0]
    n_units = layer.units

    # One fancy-indexing gather replaces the per-unit concatenate/append
    # loop, which re-copied the whole kernel on every iteration.
    picked = np.random.randint(n_units, size=n_add)
    student_w = np.concatenate((teacher_w, teacher_w[:, picked]), axis=1)
    student_b = np.concatenate((teacher_b, teacher_b[picked]))

    # NOTE(review): the activation is fixed to 'relu' here, whereas
    # wider_next_dense copies the source layer's activation -- confirm this
    # is intended for non-relu layers.
    new_pre_layer = Dense(n_units + n_add,
                          input_shape=(n_inputs, ),
                          activation='relu')
    new_pre_layer.build((None, n_inputs))
    new_pre_layer.set_weights((student_w, student_b))
    return new_pre_layer
class AnswerProbability(Layer):
    """Score each answer encoding against the others and normalise the scores."""

    def __init__(self, **kwargs):
        super(AnswerProbability, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring dense layer.

        The dense layer consumes the concatenation of three tensors shaped
        like the input, hence three times the input feature size.
        """
        # input_shape: (None, 5, 200)
        scorer_shape = input_shape[:-1] + (3 * input_shape[-1], )
        self.dense_1 = Dense(1, activation="relu")
        self.dense_1.build(scorer_shape)
        self.trainable_weights = self.dense_1.trainable_weights
        super(AnswerProbability, self).build(input_shape)

    def call(self, answer_encoding):
        """Return a softmax over axis 1 of ``answer_encoding``."""
        transposed = K.permute_dimensions(answer_encoding, (0, 2, 1))
        similarity = tf.matmul(answer_encoding, transposed)

        # Build a mask that is 0 on the diagonal and 1 elsewhere.
        identity = K.eye(Params.max_passage_count)
        zeros = K.zeros_like(identity)
        off_diagonal = K.cast(K.equal(identity, zeros), dtype="float32")

        # NOTE(review): the diagonal is zeroed *before* the softmax, so it
        # still receives exp(0) weight -- confirm this is intended.
        similarity = similarity * off_diagonal
        attention = Softmax(axis=-1)(similarity)

        attended = tf.matmul(attention, answer_encoding)
        features = K.concatenate(
            [answer_encoding, attended, answer_encoding * attended])

        scores = self.dense_1(features)
        scores = K.squeeze(scores, axis=-1)
        return Softmax(axis=-1)(scores)

    def compute_output_shape(self, input_shape):
        return (None, input_shape[1])
def build(self, input_shape):
    """Create the optional per-task bias/gain weights, then the Dense weights.

    For every entry of ``self.task_features`` one weight vector of length
    ``self.units`` is added when the corresponding flag is set; otherwise
    the attribute is ``None``. Both kinds of weight use the layer's bias
    initializer, regularizer and constraint.
    """
    if not self.use_task_bias:
        self.task_bias = None
    else:
        self.task_bias = {
            feature: self.add_weight(shape=(self.units, ),
                                     initializer=self.bias_initializer,
                                     name='task_bias%s' % (str(feature)),
                                     regularizer=self.bias_regularizer,
                                     constraint=self.bias_constraint)
            for feature in self.task_features
        }

    if not self.use_task_gain:
        self.task_gain = None
    else:
        self.task_gain = {
            feature: self.add_weight(shape=(self.units, ),
                                     initializer=self.bias_initializer,
                                     name='task_gain%s' % (str(feature)),
                                     regularizer=self.bias_regularizer,
                                     constraint=self.bias_constraint)
            for feature in self.task_features
        }

    Dense.build(self, input_shape)
class SpanBegin(Layer):
    """Softmax over positions from the concatenation of two input tensors."""

    def __init__(self, **kwargs):
        super(SpanBegin, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the single-unit dense layer for the concatenated features."""
        first_shape = input_shape[0]
        concat_features = first_shape[-1] + input_shape[1][-1]
        dense_shape = first_shape[:-1] + (concat_features, )

        self.dense_1 = Dense(units=1)
        self.dense_1.build(dense_shape)
        self.trainable_weights = self.dense_1.trainable_weights
        super(SpanBegin, self).build(input_shape)

    def call(self, inputs):
        """Return softmax-normalised scores, one per position.

        Args:
            inputs: Pair ``(merged_context, modeled_passage)``.
        """
        merged_context, modeled_passage = inputs
        features = K.concatenate([merged_context, modeled_passage])
        logits = TimeDistributed(self.dense_1)(features)
        # Drop the trailing unit axis before normalising.
        return Softmax()(K.squeeze(logits, axis=-1))

    def compute_output_shape(self, input_shape):
        context_shape, _ = input_shape
        return context_shape[:-1]

    def get_config(self):
        return super().get_config()
class ContentIndice(Layer):
    """Two stacked dense layers producing one unnormalised score per position."""

    def __init__(self, **kwargs):
        super(ContentIndice, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create a ``relu`` layer of ``embedding_dim`` units and a linear scorer."""
        hidden_shape = input_shape[:-1] + (embedding_dim, )

        self.dense_1 = Dense(embedding_dim, activation="relu")
        self.dense_1.build(input_shape)
        self.dense_2 = Dense(1, activation="linear")
        self.dense_2.build(hidden_shape)

        self.trainable_weights = (self.dense_1.trainable_weights +
                                  self.dense_2.trainable_weights)
        super(ContentIndice, self).build(input_shape)

    def call(self, passage_modeling):
        """Return the raw scores with the trailing unit axis removed."""
        hidden = self.dense_1(passage_modeling)
        scores = self.dense_2(hidden)
        # No softmax is applied here; the raw scores are returned.
        return K.squeeze(scores, axis=-1)

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]

    def get_config(self):
        return super().get_config()
def build(self, input_shape):
    """Build the Dense weights, then wrap them in train-phase dropout.

    After ``Dense.build`` has created ``self.kernel`` and ``self.bias``, the
    attributes are rebound to expressions that apply ``K.dropout`` during
    training and fall back to the plain weight otherwise.
    """
    Dense.build(self, input_shape)
    if self.needs_drop:
        # Drop individual kernel entries while training only.
        self.kernel = K.in_train_phase(K.dropout(self.kernel, self.prob, self.drop_noise_shape), self.kernel)
        # NOTE(review): the source's indentation was lost; the bias branch is
        # assumed to be nested under ``needs_drop`` -- confirm.
        if self.drop_bias:
            # The same ``drop_noise_shape`` is used for the bias as for the kernel.
            self.bias = K.in_train_phase(K.dropout(self.bias, self.prob, self.drop_noise_shape), self.bias)
class Attention_local(Layer):
    """Local attention with a predicted alignment position and Gaussian window.

    Inputs are ``[source_hidden_state, target_hidden_state]``.
    """

    def __init__(self):
        # This layer is just to start defining the layer
        super(Attention_local, self).__init__()

    def build(self, input_shape):
        """Create the projection layers from the source-state shape."""
        # This layer defines the shape of the weights and bias
        self.sequence_length = input_shape[0][1]  # the number of words (max_len)
        self.output_dimensions = input_shape[0][2]  # output dim [hidden vec dimensions]

        self.W_p = Dense(self.output_dimensions)
        self.W_p.build(input_shape=(None, None, self.output_dimensions))  # (B, 1, H)

        self.W_a = Dense(self.output_dimensions)
        self.W_a.build(input_shape=(None, None, self.output_dimensions))  # (B, 1, H)

        self.V_p = tf.keras.layers.Dense(1)
        self.V_p.build(input_shape=(None, None, self.output_dimensions))

        self.window_width = WINDOW_WIDTH
        super(Attention_local, self).build(input_shape)

    def call(self, inputs):
        """Return the context vector, the window-weighted sum over axis 1.

        Args:
            inputs: ``[source_hidden_state (B, S, H), target_hidden_state (B, H)]``.
        """
        # This is where the action happens
        target_hidden_state = inputs[1]  # (B, H)
        source_hidden_state = inputs[0]  # (B, S, H)
        hidden_with_time_axis = tf.expand_dims(input=target_hidden_state,
                                               axis=1)  # (B, 1, H)

        # N = W_1(h_t)
        # M = V(tanh(N))
        aligned_position = self.V_p(K.tanh(
            self.W_p(hidden_with_time_axis)))  # (B, 1, 1)
        # p_t = sigmoid(M) * S
        aligned_position = K.sigmoid(aligned_position)  # (B, 1, 1)
        aligned_position = aligned_position * self.sequence_length  # (B, 1, 1)

        # alpha_t = softmax(h_t W_2 h_s)
        attention_score = K.softmax(
            source_hidden_state * self.W_a(hidden_with_time_axis))  # (B, S, H)
        # NOTE(review): softmax is applied a second time here -- kept as in
        # the original; confirm whether the double normalisation is intended.
        attention_weights = Activation('softmax')(attention_score)  # (B, S, H)

        # alpha_t(s) = alpha_t * exp(-((s - p_t)^2) / (2 sigma^2))
        # Evaluate the window for all source positions in one broadcast
        # instead of creating one Concatenate layer per timestep.
        positions = tf.cast(tf.range(int(self.sequence_length)),
                            aligned_position.dtype)
        positions = tf.reshape(positions, (1, -1, 1))  # (1, S, 1)
        gaussian_factor = tf.exp(-2 * tf.square(
            (positions - aligned_position) / self.window_width))  # (B, S, 1)
        attention_weights = attention_weights * gaussian_factor  # (B, S, H)

        # C_t = sum(alpha_t(s) * h_s)
        context_embedding = attention_weights * source_hidden_state  # (B, S, H)

        # Derive context vector by getting the weighted average over the
        # source states
        context_vector = tf.reduce_sum(context_embedding, axis=1)
        return context_vector
class SentenceEncoderBlock(Layer):
    """Encoder block: multi-head attention, two dense layers, shared layer norm.

    ``call`` returns ``[encoded, attention_maps]``.
    """

    def __init__(self,
                 output_dim,
                 attention_dim,
                 n_heads,
                 dropout=0.3,
                 **kwargs):
        """Store the block's hyper-parameters.

        Args:
            output_dim: Units of both dense layers.
            attention_dim: Dimension passed to ``MultiHeadAttention``.
            n_heads: Number of attention heads.
            dropout: Dropout rate applied after attention and after the
                dense layers while training.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        self.output_dim = output_dim  # Output dimension of the encoder after the fully connected layers
        self.n_heads = n_heads
        self.attention_dim = attention_dim  # Dimension used for dq/dk/dv in multi-head attention
        self.activation = "relu"
        self.dropout = dropout
        super(SentenceEncoderBlock, self).__init__(**kwargs)
        # Mark the layer as phase dependent so the learning phase is fed
        # when dropout is selected through ``K.in_train_phase``.
        self.uses_learning_phase = True

    def build(self, input_shape):
        """Build all sub-layers with ``input_shape`` and collect their weights."""
        # "Two linear transformations with a ReLU activation in between" #
        self.dense_1 = Dense(self.output_dim, activation=self.activation)
        self.dense_1.build(input_shape)
        self._trainable_weights += self.dense_1.trainable_weights

        self.dense_2 = Dense(self.output_dim)
        self.dense_2.build(input_shape)
        self._trainable_weights += self.dense_2.trainable_weights

        # MultiHeadAttention #
        self.multihead_attention = MultiHeadAttention(self.attention_dim,
                                                      self.n_heads)
        self.multihead_attention.build(input_shape)
        self._trainable_weights += self.multihead_attention.trainable_weights

        # LayerNorm #
        self.layer_normalization = LayerNormalization()
        self.layer_normalization.build(input_shape)
        self._trainable_weights += self.layer_normalization.trainable_weights

        super(SentenceEncoderBlock, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer
        return mask

    def _apply_dropout(self, tensor, training=None):
        """Return ``tensor`` with dropout in the training phase only."""
        return K.in_train_phase(lambda: K.dropout(tensor, self.dropout),
                                tensor,
                                training=training)

    def call(self, x, mask=None, training=None):
        """Run the block.

        Args:
            x: Input tensor.
            mask: Unused; kept for the Keras call signature.
            training: Optional explicit phase; ``None`` defers to the Keras
                learning phase.

        Returns:
            ``[h_xz, all_attns]``.
        """
        z, all_attns = self.multihead_attention(x)
        # The original applied K.dropout unconditionally, which also dropped
        # activations at inference time.
        z = self._apply_dropout(z, training)
        xz = self.layer_normalization(x + z)

        h_xz = self.dense_1(xz)
        h_xz = self.dense_2(h_xz)
        h_xz = self._apply_dropout(h_xz, training)
        h_xz = self.layer_normalization(h_xz + xz)
        return [h_xz, all_attns]

    def compute_output_shape(self, input_shape):
        return [(input_shape[0], input_shape[1], self.output_dim),
                (input_shape[0], self.n_heads, input_shape[1],
                 input_shape[1])]
def get_dense_controller(controller_output_dim,
                         controller_input_dim,
                         activation='relu',
                         batch_size=1):
    """Return a built ``Dense`` controller for a fixed batch size.

    Args:
        controller_output_dim: Number of units.
        controller_input_dim: Number of input features.
        activation: Activation passed to ``Dense``.
        batch_size: Leading dimension of the input shape.

    Returns:
        The built ``Dense`` layer.
    """
    full_shape = (batch_size, controller_input_dim)
    dense_controller = Dense(controller_output_dim,
                             activation=activation,
                             batch_input_shape=full_shape)
    dense_controller.build(input_shape=full_shape)
    return dense_controller
def build(self, input_shape):
    """Create the Dense weights, then initialise the layer's neurons.

    Parameters
    ----------

    input_shape: Union[list, tuple, Any]
        Keras tensor (future input to layer) or list/tuple of Keras tensors
        to reference for weight shape computations. It is passed unchanged
        to both ``Dense.build`` and ``self.init_neurons``.
    """

    # Weights must exist before the neuron state is initialised.
    Dense.build(self, input_shape)
    self.init_neurons(input_shape)
class SpanEnd(Layer):
    """Softmax over positions, conditioned on the span-begin probabilities."""

    def __init__(self, **kwargs):
        super(SpanEnd, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the bidirectional LSTM and the single-unit scorer.

        Feature sizes are derived from half the last dimension of the first
        input shape: 14 times that value for the LSTM input and 10 times for
        the dense input.
        """
        first_shape = input_shape[0]
        emdim = first_shape[-1] // 2
        leading_dims = first_shape[:-1]

        self.bilstm_1 = Bidirectional(LSTM(emdim, return_sequences=True))
        self.bilstm_1.build(leading_dims + (emdim * 14, ))

        self.dense_1 = Dense(units=1)
        self.dense_1.build(leading_dims + (emdim * 10, ))

        self.trainable_weights = (self.bilstm_1.trainable_weights +
                                  self.dense_1.trainable_weights)
        super(SpanEnd, self).build(input_shape)

    def call(self, inputs):
        """Return softmax-normalised scores, one per position.

        Args:
            inputs: ``(encoded_passage, merged_context, modeled_passage,
                span_begin_probabilities)``.
        """
        (encoded_passage, merged_context, modeled_passage,
         span_begin_probabilities) = inputs

        # Summarise the modeled passage using the begin probabilities.
        begin_weights = K.expand_dims(span_begin_probabilities, axis=-1)
        summary = K.sum(begin_weights * modeled_passage, -2)

        # Repeat the summary along axis 1 to match the passage length.
        summary = K.expand_dims(summary, axis=1)
        repeats = K.concatenate([[1], [K.shape(encoded_passage)[1]], [1]],
                                axis=0)
        tiled_summary = K.tile(summary, repeats)

        interaction = modeled_passage * tiled_summary
        representation = K.concatenate(
            [merged_context, modeled_passage, tiled_summary, interaction])
        representation = self.bilstm_1(representation)

        scorer_input = K.concatenate([merged_context, representation])
        logits = TimeDistributed(self.dense_1)(scorer_input)
        return Softmax()(K.squeeze(logits, axis=-1))

    def compute_output_shape(self, input_shape):
        _, context_shape, _, _ = input_shape
        return context_shape[:-1]

    def get_config(self):
        return super().get_config()
def dense_to_deeper_layer(dense_layer):
    """Create an identity-initialised dense layer to stack after another.

    Args:
        dense_layer: Dense layer whose ``units`` fixes the new layer's width.

    Returns:
        A built ``relu`` ``Dense`` layer with an identity kernel and zero
        bias, with as many inputs and units as ``dense_layer.units``.
    """
    width = dense_layer.units
    identity_kernel = np.eye(width)
    zero_bias = np.zeros(width)

    deeper = Dense(width, activation='relu')
    deeper.build((None, width))
    deeper.set_weights((identity_kernel, zero_bias))
    return deeper
def build(self, input_shape):
    """Create the Dense weights and neurons; optionally set up bias relaxation.

    Parameters
    ----------

    input_shape: Union[list, tuple, Any]
        Keras tensor (future input to layer) or list/tuple of Keras tensors
        to reference for weight shape computations.
    """

    Dense.build(self, input_shape)
    self.init_neurons(input_shape)

    relax_bias = self.config.getboolean('cell', 'bias_relaxation')
    if not relax_bias:
        return

    # Keep a copy of the initial bias and register the bias update.
    initial_bias = k.get_value(self.bias)
    self.b0 = k.variable(initial_bias)
    self.add_update([(self.bias, self.update_b())])
class Highway(Layer):
    """Highway layer: a gated mix of a dense transform and the identity.

    codes from github: https://github.com/batikim09/Keras_highways/blob/master/src/conv2d_highway.py
    """

    activation = None
    transform_gate_bias = None

    def __init__(self, activation='relu', transform_gate_bias=-2, **kwargs):
        """Store the transform activation and the gate's initial bias."""
        self.activation = activation
        self.transform_gate_bias = transform_gate_bias
        super(Highway, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the gate and transform dense layers, both of input width."""
        width = input_shape[-1]

        # Gate layer, biased with the configured constant.
        gate_init = Constant(self.transform_gate_bias)
        self.dense_1 = Dense(units=width, bias_initializer=gate_init)
        self.dense_1.build(input_shape)

        # Transform layer.
        self.dense_2 = Dense(units=width)
        self.dense_2.build(input_shape)

        self.trainable_weights = (self.dense_1.trainable_weights +
                                  self.dense_2.trainable_weights)
        super(Highway, self).build(input_shape)  # Be sure to call this at the end

    def call(self, x):
        """Return ``gate * activation(dense_2(x)) + (1 - gate) * x``."""
        width = K.int_shape(x)[-1]

        gate = Activation("sigmoid")(self.dense_1(x))
        carry = Lambda(lambda t: 1.0 - t, output_shape=(width, ))(gate)

        transformed = Activation(self.activation)(self.dense_2(x))

        gated_transform = Multiply()([gate, transformed])
        gated_identity = Multiply()([carry, x])
        return Add()([gated_transform, gated_identity])

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = super().get_config()
        config.update({
            'activation': self.activation,
            'transform_gate_bias': self.transform_gate_bias,
        })
        return config
class Discriminator(object):
    """LSTM followed by a single-unit dense layer, with clipped weights."""

    def __init__(self, x_k, n_steps, hidden_dim):
        """Build both layers and collect their weights and constraints.

        Args:
            x_k: Stored as ``self.x_k``; not used further in this class.
            n_steps: Sequence length used to build the LSTM.
            hidden_dim: LSTM width and dense input size.
        """
        self.x_k = x_k
        self.hidden_dim = hidden_dim

        def make_clip():
            # A fresh constraint object per weight.
            return ClipConstraint(1e-2)

        self.lstm = LSTM(hidden_dim)
        self.lstm.build((None, n_steps, 1))
        for weight in self.lstm.trainable_weights:
            self.lstm.constraints[weight] = make_clip()

        self.dense = Dense(1, W_constraint=make_clip())
        self.dense.build((None, hidden_dim))

        self.weights = (self.lstm.trainable_weights +
                        self.dense.trainable_weights)

        merged_constraints = self.lstm.constraints.copy()
        merged_constraints.update(self.dense.constraints)
        self.constraints = merged_constraints

    def call(self, x):
        """Return the dense layer applied to the LSTM output for ``x``."""
        encoded = self.lstm.call(x)
        return self.dense.call(encoded)
class BERTSelfOutput(Layer):
    """Dense projection, dropout, residual add and layer norm after attention."""

    def __init__(self, config, **kwargs):
        """Create the layer.

        Args:
            config: Object exposing ``hidden_size`` and
                ``hidden_dropout_prob``.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        # Pass the frozen state as a keyword: an attribute set before
        # ``Layer.__init__`` is replaced by the base constructor.
        kwargs.setdefault('trainable', False)
        super().__init__(**kwargs)
        self.config = config
        self.dense = Dense(input_shape=(self.config.hidden_size, ),
                           units=self.config.hidden_size,
                           trainable=False)
        self.LayerNorm = BERTLayerNorm(self.config, trainable=False)
        self.dropout = Dropout(self.config.hidden_dropout_prob,
                               trainable=False)

    def build(self, input_shape):
        """Build the sub-layers and mark this layer as built.

        Args:
            input_shape: Shape handed over by the caller. It is only passed
                on to ``Layer.build``.
        """
        # ``Dense.build`` only reads the last entry of the shape, and the
        # projection always maps ``hidden_size`` features, so size the kernel
        # from the config. This also covers the case where ``input_shape`` is
        # the list of two shapes that matches ``call``'s input pair.
        self.dense.build((None, self.config.hidden_size))
        self.LayerNorm.build(self.config.hidden_size)
        self.dropout.build(self.config.hidden_size)
        super(BERTSelfOutput, self).build(input_shape)

    def call(self, x, **kwargs):
        """Compute ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``.

        Args:
            x: Pair ``(input_tensor, hidden_states)``.

        Returns:
            The normalised tensor.
        """
        input_tensor, hidden_states = x
        original_shape = hidden_states.shape
        # Flatten positions into rows for the dense projection.
        hidden_states_r = K.reshape(hidden_states,
                                    (-1, hidden_states.shape[-1]))
        hidden_states = self.dense(hidden_states_r)
        hidden_states = self.dropout(hidden_states)
        # Restore the original sequence and feature axes.
        hidden_states_r = K.reshape(hidden_states,
                                    (-1, original_shape[1], original_shape[2]))
        hidden_states = self.LayerNorm(hidden_states_r + input_tensor)
        return hidden_states
class SpanBegin(Layer):
    """Softmax over positions computed from a single input tensor."""

    def __init__(self, **kwargs):
        super(SpanBegin, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the single-unit dense layer sized by the last input dim."""
        # input_shape: (None, 200, embeddim*8+embeddim*2)
        dense_shape = (input_shape[0], input_shape[-1])
        self.dense_1 = Dense(units=1)
        self.dense_1.build(dense_shape)
        self.trainable_weights = self.dense_1.trainable_weights
        super(SpanBegin, self).build(input_shape)

    def call(self, span_begin_input):
        """Return softmax-normalised scores, one per position."""
        logits = TimeDistributed(self.dense_1)(span_begin_input)
        # Drop the trailing unit axis before normalising.
        return Softmax()(K.squeeze(logits, axis=-1))

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]

    def get_config(self):
        return super().get_config()
class SpanEnd(Layer):
    """Softmax over positions computed from a single input tensor."""

    def __init__(self, **kwargs):
        super(SpanEnd, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the single-unit dense layer for ``embedding_dim * 10`` features."""
        dense_shape = (input_shape[0], embedding_dim * 10)
        self.dense_1 = Dense(units=1)
        self.dense_1.build(dense_shape)
        self.trainable_weights = self.dense_1.trainable_weights
        super(SpanEnd, self).build(input_shape)

    def call(self, span_end_input):
        """Return softmax-normalised scores, one per position."""
        logits = TimeDistributed(self.dense_1)(span_end_input)
        # Drop the trailing unit axis before normalising.
        return Softmax()(K.squeeze(logits, axis=-1))

    def compute_output_shape(self, input_shape):
        return input_shape[:-1]

    def get_config(self):
        return super().get_config()
class MDN(Layer):
    """A Mixture Density Network layer for Keras.

    Three dense heads are applied to the same input and concatenated:

    - ``mdn_mus``: ``num_mixtures * output_dimension`` values,
      ``elu_plus_one_plus_epsilon`` activation.
    - ``mdn_sigmas``: ``num_mixtures * output_dimension`` values,
      ``elu_plus_one_plus_epsilon`` activation.
    - ``mdn_pi``: ``num_mixtures`` values, ``softmax`` activation.

    A loss function needs to be constructed with the same output dimension
    and number of mixtures.
    """

    def __init__(self, output_dimension, num_mixtures, **kwargs):
        """Create the three heads.

        Args:
            output_dimension: Dimensionality of each mixture component.
            num_mixtures: Number of mixture components.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        self.output_dim = output_dimension
        self.num_mix = num_mixtures
        component_units = self.num_mix * self.output_dim
        with tf.name_scope('MDN'):
            self.mdn_mus = Dense(component_units,
                                 activation=elu_plus_one_plus_epsilon,
                                 name='mdn_mus')
            self.mdn_sigmas = Dense(component_units,
                                    activation=elu_plus_one_plus_epsilon,
                                    name='mdn_sigmas')
            self.mdn_pi = Dense(self.num_mix,
                                name='mdn_pi',
                                activation='softmax')
        super(MDN, self).__init__(**kwargs)

    def build(self, input_shape):
        """Build every head and expose their weights on this layer."""
        heads = (self.mdn_mus, self.mdn_sigmas, self.mdn_pi)
        for head in heads:
            head.build(input_shape)

        self.trainable_weights = (self.mdn_mus.trainable_weights +
                                  self.mdn_sigmas.trainable_weights +
                                  self.mdn_pi.trainable_weights)
        self.non_trainable_weights = (self.mdn_mus.non_trainable_weights +
                                      self.mdn_sigmas.non_trainable_weights +
                                      self.mdn_pi.non_trainable_weights)
        super(MDN, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the concatenated outputs ``[mus, sigmas, pi]`` for ``x``."""
        with tf.name_scope('MDN'):
            mus = self.mdn_mus(x)
            sigmas = self.mdn_sigmas(x)
            pis = self.mdn_pi(x)
            return keras.layers.concatenate([mus, sigmas, pis],
                                            name='mdn_outputs')

    def compute_output_shape(self, input_shape):
        """Returns output shape, showing the number of mixture parameters."""
        parameter_count = (2 * self.output_dim * self.num_mix) + self.num_mix
        return (input_shape[0], parameter_count)

    def get_config(self):
        """Return the base config extended with this layer's arguments."""
        merged = dict(super(MDN, self).get_config())
        merged.update({
            "output_dimension": self.output_dim,
            "num_mixtures": self.num_mix
        })
        return merged
class MDN(Layer):
    """Mixture density layer with one scalar sigma per mixture component.

    ``call`` concatenates, along the last axis, ``num_mixes * output_dim``
    means, ``num_mixes`` sigmas (``exp`` activation) and ``num_mixes`` mixing
    coefficients (``softmax`` activation).
    """

    def __init__(self, output_dim, num_mixes, kernel='unigaussian', **kwargs):
        """Create the three dense heads.

        Args:
            output_dim: Dimensionality of each component mean.
            num_mixes: Number of mixture components.
            kernel: Name of the loss kernel; only ``'unigaussian'`` is
                handled by ``get_loss_func``.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        self.output_dim = output_dim
        self.kernel = kernel
        self.num_mixes = num_mixes

        with tf.name_scope('MDN'):
            self.mdn_mus = Dense(self.num_mixes * self.output_dim,
                                 name='mdn_mus')
            self.mdn_sigmas = Dense(self.num_mixes,
                                    activation=K.exp,
                                    name='mdn_sigmas')
            self.mdn_alphas = Dense(self.num_mixes,
                                    activation=K.softmax,
                                    name='mdn_alphas')
        super(MDN, self).__init__(**kwargs)

    def build(self, input_shape):
        """Build every head and expose their weights on this layer."""
        self.mdn_mus.build(input_shape)
        self.mdn_sigmas.build(input_shape)
        self.mdn_alphas.build(input_shape)
        self.trainable_weights = self.mdn_mus.trainable_weights + \
            self.mdn_sigmas.trainable_weights + \
            self.mdn_alphas.trainable_weights
        self.non_trainable_weights = self.mdn_mus.non_trainable_weights + \
            self.mdn_sigmas.non_trainable_weights + \
            self.mdn_alphas.non_trainable_weights
        self.built = True

    def call(self, x, mask=None):
        """Return the concatenated ``[mus, sigmas, alphas]`` for ``x``."""
        with tf.name_scope('MDN'):
            mdn_out = keras.layers.concatenate(
                [self.mdn_mus(x),
                 self.mdn_sigmas(x),
                 self.mdn_alphas(x)],
                name='mdn_outputs')
        return mdn_out

    def compute_output_shape(self, input_shape):
        """Return the shape of ``call``'s output.

        The last axis holds ``num_mixes * output_dim`` means plus
        ``num_mixes`` sigmas plus ``num_mixes`` alphas, matching the split
        sizes used by the loss.
        """
        n_params = self.num_mixes * self.output_dim + 2 * self.num_mixes
        return tuple(input_shape[:-1]) + (n_params, )

    def get_output_shape_for(self, input_shape):
        """Older-API name for ``compute_output_shape``.

        Previously reported ``output_dim`` as the last dimension, which does
        not match what ``call`` concatenates.
        """
        return self.compute_output_shape(input_shape)

    def get_config(self):
        """Return the base config extended with this layer's arguments."""
        config = {
            'output_dim': self.output_dim,
            'num_mixes': self.num_mixes,
            'kernel': self.kernel
        }
        base_config = super(MDN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def get_loss_func(self):
        """Return the loss for ``self.kernel``.

        Returns:
            The negative log-likelihood function when ``kernel`` is
            ``'unigaussian'``; ``None`` for any other kernel.
        """

        def unigaussian_loss(y_true, y_pred):
            mix = tf.range(start=0, limit=self.num_mixes)
            # Undo the concatenation performed in ``call``.
            out_mu, out_sigma, out_alphas = tf.split(
                y_pred,
                num_or_size_splits=[
                    self.num_mixes * self.output_dim, self.num_mixes,
                    self.num_mixes
                ],
                axis=-1,
                name='mdn_coef_split')

            def loss_i(i):
                """Weighted density of ``y_true`` under component ``i``."""
                batch_size = tf.shape(out_sigma)[0]
                sigma_i = tf.slice(out_sigma, [0, i], [batch_size, 1],
                                   name='mdn_sigma_slice')
                alpha_i = tf.slice(out_alphas, [0, i], [batch_size, 1],
                                   name='mdn_alpha_slice')
                mu_i = tf.slice(out_mu, [0, i * self.output_dim],
                                [batch_size, self.output_dim],
                                name='mdn_mu_slice')
                dist = tf.distributions.Normal(loc=mu_i, scale=sigma_i)
                loss = dist.prob(
                    y_true)  # find the pdf around each value in y_true
                loss = alpha_i * loss
                return loss

            result = tf.map_fn(lambda m: loss_i(m),
                               mix,
                               dtype=tf.float32,
                               name='mix_map_fn')
            # Sum over components, then average the negative log-likelihood.
            result = tf.reduce_sum(result, axis=0, keep_dims=False)
            result = -tf.log(result)
            result = tf.reduce_mean(result)
            return result

        if self.kernel == 'unigaussian':
            with tf.name_scope('MDNLayer'):
                return unigaussian_loss
        # No loss is defined for other kernels.
        return None
class NSE(Layer):
    '''
    Simple Neural Semantic Encoder.

    A reader LSTM produces one output per timestep; at each step the memory
    is summarized by attention, composed with the reader output by a dense
    layer, passed through a writer LSTM step and written back into memory.
    '''
    def __init__(self,
                 output_dim,
                 input_length=None,
                 composer_activation='linear',
                 return_mode='last_output',
                 weights=None,
                 **kwargs):
        '''
        Arguments:
        output_dim (int)
        input_length (int)
        composer_activation (str): activation used in the MLP
        return_mode (str): One of last_output, all_outputs, output_and_memory
            This is analogous to the return_sequences flag in Keras' Recurrent.
            last_output returns only the last h_t
            all_outputs returns the whole sequence of h_ts
            output_and_memory returns the last output and the last memory
                concatenated (needed if this layer is followed by a MMA-NSE)
        weights (list): Initial weights

        Raises an Exception when return_mode is not one of the three values.
        '''
        self.output_dim = output_dim
        self.input_dim = output_dim  # Equation 2 in the paper makes this assumption.
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=3)]
        self.input_length = input_length
        self.composer_activation = composer_activation
        super(NSE, self).__init__(**kwargs)
        self.reader = LSTM(self.output_dim,
                           return_sequences=True,
                           name="{}_reader".format(self.name))
        # TODO: Let the writer use parameter dropout and any consume_less mode.
        # Setting dropout to 0 here to eliminate the need for constants.
        # Setting consume_less to mem to eliminate need for preprocessing
        self.writer = LSTM(self.output_dim,
                           dropout_W=0.0,
                           dropout_U=0.0,
                           consume_less="mem",
                           name="{}_writer".format(self.name))
        self.composer = Dense(self.output_dim * 2,
                              activation=self.composer_activation,
                              name="{}_composer".format(self.name))
        if return_mode not in [
                "last_output", "all_outputs", "output_and_memory"
        ]:
            raise Exception("Unrecognized return mode: %s" % (return_mode))
        self.return_mode = return_mode

    def get_output_shape_for(self, input_shape):
        '''
        Returns the output shape for the configured return_mode.
        '''
        input_length = input_shape[1]
        if self.return_mode == "last_output":
            return (input_shape[0], self.output_dim)
        elif self.return_mode == "all_outputs":
            return (input_shape[0], input_length, self.output_dim)
        else:
            # return_mode is output_and_memory. Output will be concatenated to memory.
            return (input_shape[0], input_length + 1, self.output_dim)

    def compute_mask(self, input, mask):
        '''
        Returns the output mask for the configured return_mode.
        '''
        if mask is None or self.return_mode == "last_output":
            return None
        elif self.return_mode == "all_outputs":
            return mask  # (batch_size, input_length)
        else:
            # Return mode is output_and_memory
            # Mask memory corresponding to all the inputs that are masked, and do not mask the output
            # (batch_size, input_length + 1)
            # NOTE(review): the entry prepended for the output is built with
            # zeros_like; confirm that 0 really means "not masked" for the
            # consumer of this mask.
            return K.cast(K.concatenate([K.zeros_like(mask[:, :1]), mask]),
                          'uint8')

    def get_composer_input_shape(self, input_shape):
        # Takes concatenation of output and memory summary
        return (input_shape[0], self.output_dim * 2)

    def get_reader_input_shape(self, input_shape):
        return input_shape

    def build(self, input_shape):
        '''
        Builds the reader, writer and composer and collects their weights.
        The component input shapes are reported on stderr.
        '''
        self.input_spec = [InputSpec(shape=input_shape)]
        assert self.reader.return_sequences, "The reader has to return sequences!"
        reader_input_shape = self.get_reader_input_shape(input_shape)
        # sys.stderr.write works on Python 2 and 3 alike, unlike the
        # ``print >> sys.stderr`` statement form.
        sys.stderr.write("NSE reader input shape: %s\n" %
                         (reader_input_shape, ))
        writer_input_shape = (input_shape[0], 1, self.output_dim * 2
                              )  # Will process one timestep at a time
        sys.stderr.write("NSE writer input shape: %s\n" %
                         (writer_input_shape, ))
        composer_input_shape = self.get_composer_input_shape(input_shape)
        sys.stderr.write("NSE composer input shape: %s\n" %
                         (composer_input_shape, ))
        self.reader.build(reader_input_shape)
        self.writer.build(writer_input_shape)
        self.composer.build(composer_input_shape)

        # Aggregate weights of individual components for this layer.
        reader_weights = self.reader.trainable_weights
        writer_weights = self.writer.trainable_weights
        composer_weights = self.composer.trainable_weights
        self.trainable_weights = reader_weights + writer_weights + composer_weights

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def read(self, nse_input, input_mask=None):
        '''
        This method produces the 'read' output (equation 1 in the paper) for all timesteps
        and initializes the memory slot mem_0.

        Input: nse_input (batch_size, input_length, input_dim)
        Outputs:
            o (batch_size, input_length, output_dim)
            flattened_mem_0 (batch_size, input_length * output_dim)

        While this method simply copies input to mem_0, variants that inherit from this class can do
        something fancier.
        '''
        input_to_read = nse_input
        mem_0 = input_to_read
        flattened_mem_0 = K.batch_flatten(mem_0)
        o = self.reader.call(input_to_read, input_mask)
        o_mask = self.reader.compute_mask(input_to_read, input_mask)
        return o, [flattened_mem_0], o_mask

    @staticmethod
    def summarize_memory(o_t, mem_tm1):
        '''
        This method selects the relevant parts of the memory given the read output and summarizes the
        memory. Implements Equations 2-3 or 8-11 in the paper.
        '''
        # Selecting relevant memory slots, Equation 2
        z_t = K.softmax(K.sum(K.expand_dims(o_t, dim=1) * mem_tm1,
                              axis=2))  # (batch_size, input_length)
        # Summarizing memory, Equation 3
        m_rt = K.sum(K.expand_dims(z_t, dim=2) * mem_tm1,
                     axis=1)  # (batch_size, output_dim)
        return z_t, m_rt

    def compose_memory_and_output(self, output_memory_list):
        '''
        This method takes a list of tensors and applies the composition function on their concatenation.
        Implements equation 4 or 12 in the paper.
        '''
        # Composition, Equation 4
        c_t = self.composer.call(
            K.concatenate(output_memory_list))  # (batch_size, output_dim * 2)
        return c_t

    def update_memory(self, z_t, h_t, mem_tm1):
        '''
        This method takes the attention vector (z_t), writer output (h_t) and previous timestep's memory (mem_tm1)
        and updates the memory. Implements equations 6, 14 or 15.
        '''
        tiled_z_t = K.tile(
            K.expand_dims(z_t),
            (self.output_dim))  # (batch_size, input_length, output_dim)
        input_length = K.shape(mem_tm1)[1]
        # (batch_size, input_length, output_dim)
        tiled_h_t = K.permute_dimensions(
            K.tile(K.expand_dims(h_t), (input_length)), (0, 2, 1))
        # Updating memory. First term in summation corresponds to selective forgetting and the second term to
        # selective addition. Equation 6.
        mem_t = mem_tm1 * (
            1 - tiled_z_t
        ) + tiled_h_t * tiled_z_t  # (batch_size, input_length, output_dim)
        return mem_t

    def compose_and_write_step(self, o_t, states):
        '''
        This method is a step function that updates the memory at each time step and produces
        a new output vector (Equations 2 to 6 in the paper).
        The memory_state is flattened because K.rnn requires all states to be of the same shape as the output,
        because it uses the same mask for the output and the states.
        Inputs:
            o_t (batch_size, output_dim)
            states (list[Tensor])
                flattened_mem_tm1 (batch_size, input_length * output_dim)
                writer_h_tm1 (batch_size, output_dim)
                writer_c_tm1 (batch_size, output_dim)

        Outputs:
            h_t (batch_size, output_dim)
            flattened_mem_t (batch_size, input_length * output_dim)
        '''
        flattened_mem_tm1, writer_h_tm1, writer_c_tm1 = states
        input_mem_shape = K.shape(flattened_mem_tm1)
        # Floor division keeps the recovered input_length an integer on
        # Python 3 as well, where ``/`` is true division.
        mem_tm1_shape = (input_mem_shape[0],
                         input_mem_shape[1] // self.output_dim,
                         self.output_dim)
        mem_tm1 = K.reshape(
            flattened_mem_tm1,
            mem_tm1_shape)  # (batch_size, input_length, output_dim)
        z_t, m_rt = self.summarize_memory(o_t, mem_tm1)
        c_t = self.compose_memory_and_output([o_t, m_rt])
        # Collecting the necessary variables to directly call writer's step function.
        writer_constants = self.writer.get_constants(
            c_t)  # returns dropouts for W and U (all 1s, see init)
        writer_states = [writer_h_tm1, writer_c_tm1] + writer_constants
        # Making a call to writer's step function, Equation 5
        h_t, [_, writer_c_t] = self.writer.step(
            c_t, writer_states)  # h_t, writer_c_t: (batch_size, output_dim)
        mem_t = self.update_memory(z_t, h_t, mem_tm1)
        flattened_mem_t = K.batch_flatten(mem_t)
        return h_t, [flattened_mem_t, h_t, writer_c_t]

    def call(self, x, mask=None):
        '''
        Runs the read step and then the compose-and-write recurrence, and
        returns the result selected by return_mode.
        '''
        # input_shape = (batch_size, input_length, input_dim). This needs to be defined in build.
        read_output, initial_memory_states, output_mask = self.read(x, mask)
        initial_write_states = self.writer.get_initial_states(
            read_output)  # h_0 and c_0 of the writer LSTM
        initial_states = initial_memory_states + initial_write_states
        # last_output: (batch_size, output_dim)
        # all_outputs: (batch_size, input_length, output_dim)
        # last_states:
        #       last_memory_state: (batch_size, input_length, output_dim)
        #       last_output
        #       last_writer_ct
        last_output, all_outputs, last_states = K.rnn(
            self.compose_and_write_step,
            read_output,
            initial_states,
            mask=output_mask)
        last_memory = last_states[0]
        if self.return_mode == "last_output":
            return last_output
        elif self.return_mode == "all_outputs":
            return all_outputs
        else:
            # return mode is output_and_memory
            expanded_last_output = K.expand_dims(
                last_output, dim=1)  # (batch_size, 1, output_dim)
            # (batch_size, 1+input_length, output_dim)
            return K.concatenate([expanded_last_output, last_memory], axis=1)

    def get_config(self):
        '''
        Returns this layer's arguments merged with the base layer config.
        '''
        config = {
            'output_dim': self.output_dim,
            'input_length': self.input_length,
            'composer_activation': self.composer_activation,
            'return_mode': self.return_mode
        }
        base_config = super(NSE, self).get_config()
        config.update(base_config)
        return config
def _pad_with_last(values, length):
    """Return ``values`` as a list extended to ``length`` by repeating its last item."""
    values = list(values)
    return values + [values[-1]] * (length - len(values))


def build_model(max_length: int,
                embedding_matrix: Union[np.ndarray, Tuple[int]],
                transformer_depth: int,
                transformer_heads: int,
                filters: List[int],
                kernel_size: List[int],
                pool_size: List[int],
                conv_padding: str,
                pool_padding: str,
                dense_size: List[int],
                loaded_model: Optional[str] = None,
                fine_tune_model: bool = False,
                l2_penalty: Optional[float] = None,
                embedding_dropout: float = 0.6,
                transformer_dropout: float = 0.1,
                conv_dropout: float = 0.1,
                dense_dropout: Union[float, List[float]] = 0.3,
                classifier_dropout: float = 0.1,
                train_lm: bool = True) -> Model:
    """Build a transformer language model, optionally topped with a CNN classifier.

    The network embeds the integer-encoded input, adds a position embedding,
    applies ``transformer_depth`` transformer blocks and projects back to the
    vocabulary through an output layer that shares the embedding matrix.  When
    ``fine_tune_model`` is set, a stack of Conv1D/MaxPooling1D layers, optional
    dense layers and a sigmoid classifier are placed on the transformer output.

    Args:
        max_length: Length of the integer-encoded input sequence.
        embedding_matrix: Either the initial embedding matrix (its ``shape``
            gives the embedding layer's input and output dimensions) or a tuple
            whose first two items are used as those dimensions, in which case
            no initial weights are set unless ``loaded_model`` is given.
        transformer_depth: Number of transformer blocks.
        transformer_heads: Number of attention heads per block.
        filters: Number of filters of each convolutional layer.
        kernel_size: Kernel size of each convolutional layer.
        pool_size: Pool size of each max-pooling layer.
        conv_padding: Padding mode passed to every ``Conv1D``.
        pool_padding: Padding mode passed to every ``MaxPooling1D``.
        dense_size: Units of each dense layer; may be empty.
        loaded_model: Path of a saved model; when given, weights of layers
            with matching names are copied from it.
        fine_tune_model: If false, only the language model is returned.
        l2_penalty: L2 factor for the embedding and output projection;
            a falsy value disables the regularizer.
        embedding_dropout: Dropout passed to the output projection layer.
        transformer_dropout: Residual and attention dropout of each block.
        conv_dropout: Rate of the dropout layer applied before every convolution.
        dense_dropout: Dropout rate(s) applied before each dense layer.
        classifier_dropout: Rate of the dropout applied before the classifier.
        train_lm: If true, the fine-tuning model outputs both the language
            model predictions and the classifier prediction.

    Returns:
        The language model when ``fine_tune_model`` is false; otherwise a
        model whose outputs are ``[lm_output, classifier_output]`` when
        ``train_lm`` is true, or ``classifier_output`` alone.

    Raises:
        ValueError: If ``filters``, ``kernel_size`` or ``pool_size`` is empty.

    ``filters``/``kernel_size``/``pool_size`` and ``dense_size``/``dense_dropout``
    of unequal lengths are padded with their last value (a warning is logged).
    """
    if not (len(filters) > 0 and len(kernel_size) > 0 and len(pool_size) > 0):
        message = "There are no filters, kernel sizes or pool sizes specified for the CNN."
        logger.error(message)
        raise ValueError(message)

    if not isinstance(dense_dropout, list):
        dense_dropout = [dense_dropout]

    if len(dense_size) > 0 and len(dense_size) != len(dense_dropout):
        padded_length = max(len(dense_size), len(dense_dropout))
        dense_size = _pad_with_last(dense_size, padded_length)
        dense_dropout = _pad_with_last(dense_dropout, padded_length)
        logger.warning(
            "Lists given for dense layer sizes and dense layer dropout rates are not the same length. "
            "The shorter lists are padded using the last value to match the length of the longest."
        )

    if not (len(filters) == len(kernel_size) == len(pool_size)):
        padded_length = max(len(filters), len(kernel_size), len(pool_size))
        filters = _pad_with_last(filters, padded_length)
        kernel_size = _pad_with_last(kernel_size, padded_length)
        pool_size = _pad_with_last(pool_size, padded_length)
        logger.warning(
            "Lists given for convolutional filters, kernel sizes and pooling sizes had different lengths. "
            "The shorter lists are padded using the last value to match the length of the longest."
        )

    original_model = None
    if loaded_model:
        # load the specified model
        original_model = load_model(loaded_model,
                                    custom_objects={
                                        "perplexity": perplexity,
                                        "lm_accuracy": lm_accuracy
                                    })

    # regularizer for embedding layer
    l2_regularizer = l2(l2_penalty) if l2_penalty else None

    # input encoded as integers
    raw_input = Input(shape=(max_length, ), name="input")

    # embedding layer: a tuple only carries the dimensions, an array also
    # provides the initial weights (unless they come from the loaded model)
    shape_only = isinstance(embedding_matrix, tuple)
    if shape_only:
        vocabulary_size = embedding_matrix[0]
        embedding_dim = embedding_matrix[1]
    else:
        vocabulary_size = embedding_matrix.shape[0]
        embedding_dim = embedding_matrix.shape[1]

    if loaded_model:
        embedding_weights = [
            original_model.get_layer(name="word_embedding").get_weights()[0]
        ]
    elif shape_only:
        embedding_weights = None
    else:
        embedding_weights = [embedding_matrix]

    embedding_layer = ReusableEmbedding(input_dim=vocabulary_size,
                                        output_dim=embedding_dim,
                                        input_length=max_length,
                                        name="word_embedding",
                                        weights=embedding_weights,
                                        embeddings_regularizer=l2_regularizer)

    # "transpose" of embedding matrix to map back to vocabulary
    output_layer_kwargs = {}
    if loaded_model:
        output_layer_kwargs["weights"] = original_model.get_layer(
            name="word_prediction_logits").get_weights()
    output_layer = TiedOutputEmbedding(projection_regularizer=l2_regularizer,
                                       projection_dropout=embedding_dropout,
                                       name="word_prediction_logits",
                                       **output_layer_kwargs)

    # transformer as taken from here:
    # https://github.com/kpot/keras-transformer/blob/master/example/models.py
    position_kwargs = {}
    if loaded_model:
        position_kwargs["weights"] = original_model.get_layer(
            name="position_embedding").get_weights()
    position_embedding = TransformerCoordinateEmbedding(
        max_transformer_depth=1, name="position_embedding", **position_kwargs)

    # the embedding layer also returns its matrix, which the tied output
    # layer needs further down
    transformer_input, tied_embedding_matrix = embedding_layer(raw_input)
    transformer_output = position_embedding(transformer_input, step=0)

    for i in range(transformer_depth):
        block_name = "transformer" + str(i)

        # define transformer block
        transformer_block = TransformerBlock(
            name=block_name,
            num_heads=transformer_heads,
            residual_dropout=transformer_dropout,
            attention_dropout=transformer_dropout,
            use_masking=True,
            vanilla_wiring=True)

        if loaded_model:
            # the contained layers have to be built before weights can be set
            if i == 0:
                previous_layer_name = "position_embedding"
            else:
                previous_layer_name = "transformer{}_normalization2".format(i - 1)
            transformer_block.attention_layer.build(
                original_model.get_layer(previous_layer_name).output_shape)
            transformer_block.norm1_layer.build(
                original_model.get_layer(block_name + "_self_attention").output_shape)
            transformer_block.norm2_layer.build(
                original_model.get_layer(block_name + "_normalization1").output_shape)
            transformer_block.transition_layer.build(
                original_model.get_layer(block_name + "_normalization1").output_shape)

            # set weights for all the contained layers manually
            transformer_block.attention_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_self_attention")).get_weights())
            transformer_block.norm1_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_normalization1")).get_weights())
            transformer_block.norm2_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_normalization2")).get_weights())
            transformer_block.transition_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_transition")).get_weights())

        # pass output of last layer through transformer
        transformer_output = transformer_block(transformer_output)

    # nothing special to load for softmax
    softmax_layer = Softmax(name="word_predictions")
    lm_output_logits = output_layer([transformer_output, tied_embedding_matrix])
    lm_output = softmax_layer(lm_output_logits)

    if not fine_tune_model:
        return Model(inputs=raw_input, outputs=lm_output)

    loaded_layer_names = set()
    if loaded_model:
        loaded_layer_names = {layer.name for layer in original_model.layers}

    # convolution layer(s); one dropout layer is shared by all of them
    conv_dropout_layer = Dropout(conv_dropout, name="conv_dropout")
    conv_output = transformer_output
    for i, (n_filters, kernel_width, pool_width) in enumerate(
            zip(filters, kernel_size, pool_size)):
        # construct and possibly load convolutional layer
        conv_layer_name = "conv_{}".format(i)
        convolution = Conv1D(n_filters,
                             kernel_width,
                             padding=conv_padding,
                             activation="relu",
                             name=conv_layer_name)
        if loaded_model and conv_layer_name in loaded_layer_names:
            layer = original_model.get_layer(name=conv_layer_name)
            convolution.build(layer.input_shape)
            convolution.set_weights(layer.get_weights())

        # construct max pooling, no weights to load
        pooling = MaxPooling1D(pool_width,
                               padding=pool_padding,
                               name="max_pool_{}".format(i))

        # get output/input of next layer
        conv_output = pooling(convolution(conv_dropout_layer(conv_output)))

    # dense layer(s)
    flatten = Flatten(name="flatten")
    dense_output = flatten(conv_output)
    for i, (units, dropout_rate) in enumerate(zip(dense_size, dense_dropout)):
        # construct and possibly load dense layer
        dense_layer_name = "dense_{}".format(i)
        dense = Dense(units, name=dense_layer_name)
        if loaded_model and dense_layer_name in loaded_layer_names:
            layer = original_model.get_layer(name=dense_layer_name)
            dense.build(layer.input_shape)
            dense.set_weights(layer.get_weights())

        # nothing to load for dropout
        dropout = Dropout(rate=dropout_rate,
                          name="dense_dropout_{}".format(i))

        # get output
        dense_output = dense(dropout(dense_output))

    # classification layer
    classifier_dropout_layer = Dropout(classifier_dropout,
                                       name="classifier_dropout")
    classifier = Dense(1, name="classifier")
    classifier_prediction = Activation("sigmoid", name="classifier_prediction")
    classifier_output = classifier_prediction(
        classifier(classifier_dropout_layer(dense_output)))

    if train_lm:
        outputs = [lm_output, classifier_output]
    else:
        outputs = classifier_output
    return Model(inputs=raw_input, outputs=outputs)
class AttLSTMCell(LSTMCell):
    """Attention cell class for the LSTM layer.

    At every step the cell computes an attention-weighted summary of a fixed
    ``context`` tensor from the previous hidden state and feeds that summary
    (instead of the step input) to the underlying LSTM cell.

    # Arguments
        units: Positive integer, dimensionality of the output space.
        context: Tensor attended over,
            [batch_size, sentence_length, embedding_dim].
        context_length: Number of positions in ``context``; also the number
            of attention weights produced per step.
        att_hidden_size: Units of the hidden attention layer.
        att_type: Attention variant. Only ``"bahdanau"`` is implemented;
            ``"cosine"`` is recognised but raises ``NotImplementedError``.
        activation: Activation function to use
            (see [activations](../activations.md)).
            Default: hyperbolic tangent (`tanh`).
            If you pass `None`, no activation is applied
            (ie. "linear" activation: `a(x) = x`).
        recurrent_activation: Activation function to use
            for the recurrent step
            (see [activations](../activations.md)).
            Default: hard sigmoid (`hard_sigmoid`).
            If you pass `None`, no activation is applied
            (ie. "linear" activation: `a(x) = x`).
        use_bias: Boolean, whether the layer uses a bias vector.
        kernel_initializer: Initializer for the `kernel` weights matrix,
            used for the linear transformation of the inputs
            (see [initializers](../initializers.md)).
        recurrent_initializer: Initializer for the `recurrent_kernel`
            weights matrix,
            used for the linear transformation of the recurrent state
            (see [initializers](../initializers.md)).
        bias_initializer: Initializer for the bias vector
            (see [initializers](../initializers.md)).
        unit_forget_bias: Boolean.
            If True, add 1 to the bias of the forget gate at initialization.
            Setting it to true will also force `bias_initializer="zeros"`.
            This is recommended in [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
        kernel_regularizer: Regularizer function applied to
            the `kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        recurrent_regularizer: Regularizer function applied to
            the `recurrent_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        bias_regularizer: Regularizer function applied to the bias vector
            (see [regularizer](../regularizers.md)).
        kernel_constraint: Constraint function applied to
            the `kernel` weights matrix
            (see [constraints](../constraints.md)).
        recurrent_constraint: Constraint function applied to
            the `recurrent_kernel` weights matrix
            (see [constraints](../constraints.md)).
        bias_constraint: Constraint function applied to the bias vector
            (see [constraints](../constraints.md)).
        dropout: Float between 0 and 1.
            Fraction of the units to drop for
            the linear transformation of the inputs.
        recurrent_dropout: Float between 0 and 1.
            Fraction of the units to drop for
            the linear transformation of the recurrent state.
        implementation: Implementation mode, either 1 or 2.
            Mode 1 will structure its operations as a larger number of
            smaller dot products and additions, whereas mode 2 will
            batch them into fewer, larger operations. These modes will
            have different performance profiles on different hardware and
            for different applications.
    """
    def __init__(self,
                 units,
                 context,
                 context_length,
                 att_hidden_size,
                 att_type="bahdanau",
                 activation='tanh',
                 recurrent_activation='hard_sigmoid',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 unit_forget_bias=True,
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=1,
                 **kwargs):
        super(AttLSTMCell, self).__init__(
            units=units,
            activation=activation,
            recurrent_activation=recurrent_activation,
            use_bias=use_bias,
            kernel_initializer=kernel_initializer,
            recurrent_initializer=recurrent_initializer,
            bias_initializer=bias_initializer,
            unit_forget_bias=unit_forget_bias,
            kernel_regularizer=kernel_regularizer,
            recurrent_regularizer=recurrent_regularizer,
            bias_regularizer=bias_regularizer,
            kernel_constraint=kernel_constraint,
            recurrent_constraint=recurrent_constraint,
            bias_constraint=bias_constraint,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            implementation=implementation,
            **kwargs)
        self.context = context
        self.context_length = context_length
        self.att_hidden_size = att_hidden_size
        self.att_type = att_type

    def reset_states(self):
        """Reset the states of the attention sub-layers, then of the cell."""
        # NOTE(review): relies on the Dense sub-layers and the parent cell
        # exposing `reset_states` -- confirm against the Keras version in use.
        self.att_hidden_layer.reset_states()
        self.att_output_layer.reset_states()
        super(AttLSTMCell, self).reset_states()

    def build(self, input_shape):
        """Create the attention sub-layers and build the underlying LSTM cell."""
        self.att_hidden_layer = Dense(self.att_hidden_size,
                                      activation='relu',
                                      name="att_hidden_layer")
        self.att_output_layer = Dense(self.context_length,
                                      activation='softmax',
                                      name="att_output_layer")
        self.att_reshape_layer = Reshape(
            (self.context_length * self.att_hidden_size, ))
        # NOTE(review): every sub-layer is built with the cell's own
        # `input_shape`, while `attention_bahdanau` feeds them the
        # concatenated context/state tensor and the reshaped hidden
        # activations -- confirm these shapes actually agree.
        with K.name_scope(self.att_hidden_layer.name):
            self.att_hidden_layer.build(input_shape)
        with K.name_scope(self.att_output_layer.name):
            self.att_output_layer.build(input_shape)
        with K.name_scope(self.att_reshape_layer.name):
            self.att_reshape_layer.build(input_shape)
        super(AttLSTMCell, self).build(input_shape=input_shape)
        self.built = True

    def attention(self, inputs, s_tm1):
        """Compute the attended input for the current step.

        :param inputs: [batch_size, input_dim]; accepted for symmetry with
            `call` but not read -- attention is applied to `self.context`.
        :param s_tm1: [batch_size, units_num], previous hidden state.
        :return: the attention-weighted context.
        :raises ValueError: if `self.att_type` is not a known attention type.
        """
        if self.att_type == "bahdanau":
            x = self.attention_bahdanau(self.context, s_tm1)
        elif self.att_type == "cosine":
            x = self.attention_cosine(self.context, s_tm1)
        else:
            # A bare string cannot be raised in Python 3.
            raise ValueError("unsupported attention type:{}".format(
                self.att_type))
        return x

    def attention_bahdanau(self, context, s_tm1):
        """Weight `context` by scores computed from `context` and `s_tm1`."""
        # Repeat the state once per context position so it can be
        # concatenated with the context along the last axis.
        s_tm1_seq = K.repeat(s_tm1, self.context_length)
        att_x = K.concatenate([context, s_tm1_seq])
        att_hidden = self.att_hidden_layer(att_x)
        att_hidden = self.att_reshape_layer(att_hidden)
        # Softmax output: one weight per context position.
        weights = self.att_output_layer(att_hidden)
        x = K.batch_dot(weights, context, axes=[1, 1])
        return x

    def attention_cosine(self, context, s_tm1):
        """Cosine attention placeholder; not implemented."""
        raise NotImplementedError("cosine attention is not implemented")

    def call(self, inputs, states, training=None):
        '''
        Run one LSTM step on the attended context.

        inputs: [batch_size, embedding_dim]
        states: [h(t-1), c(t-1)]
        '''
        s_tm1 = states[0]
        inputs = self.attention(inputs, s_tm1)
        return super(AttLSTMCell, self).call(inputs, states, training)

    def get_weights(self):
        """Return the weights of the two attention Dense layers only."""
        return self.att_hidden_layer.get_weights(
        ) + self.att_output_layer.get_weights()

    @property
    def trainable_weights(self):
        """Trainable weights of the LSTM cell plus the attention layers."""
        weights = []
        if hasattr(super(AttLSTMCell, self), 'trainable_weights'):
            weights += super(AttLSTMCell, self).trainable_weights
        if hasattr(self.att_hidden_layer, 'trainable_weights'):
            weights += self.att_hidden_layer.trainable_weights
        if hasattr(self.att_output_layer, 'trainable_weights'):
            weights += self.att_output_layer.trainable_weights
        return weights

    @property
    def non_trainable_weights(self):
        """Non-trainable weights of the LSTM cell plus the attention layers."""
        weights = []
        if hasattr(super(AttLSTMCell, self), 'non_trainable_weights'):
            weights += super(AttLSTMCell, self).non_trainable_weights
        if hasattr(self.att_hidden_layer, 'non_trainable_weights'):
            weights += self.att_hidden_layer.non_trainable_weights
        if hasattr(self.att_output_layer, 'non_trainable_weights'):
            weights += self.att_output_layer.non_trainable_weights
        return weights

    @property
    def updates(self):
        """Updates of the LSTM cell plus the attention layers."""
        updates = []
        if hasattr(super(AttLSTMCell, self), 'updates'):
            updates += super(AttLSTMCell, self).updates
        if hasattr(self.att_hidden_layer, 'updates'):
            updates += self.att_hidden_layer.updates
        if hasattr(self.att_output_layer, 'updates'):
            updates += self.att_output_layer.updates
        return updates

    def get_config(self):
        """Return the parent config extended with the attention settings.

        `context` is a tensor and is not included, so it has to be supplied
        again when the cell is re-created from this config.
        """
        config = {
            'context_length': self.context_length,
            'att_hidden_size': self.att_hidden_size,
            'att_type': self.att_type
        }
        base_config = super(AttLSTMCell, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
class CNNEncoder(Layer):
    '''
    CNNEncoder is a combination of multiple convolution layers and max pooling layers. This is
    defined as a single layer to be consistent with the other encoders in terms of input and output
    specifications.  The input to this "layer" is of shape (batch_size, num_words, embedding_size)
    and the output is of size (batch_size, output_dim).

    The CNN has one convolution layer per each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be used depends
    on the ngram size: input_length - ngram_size + 1. The corresponding maxpooling layer aggregates
    all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project it back to the desired output_dim.  For more
    details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.

    Keyword arguments read from ``kwargs`` (everything else goes to ``Layer``):
    ``num_filters`` (required), ``output_dim`` (required),
    ``ngram_filter_sizes`` (default ``(2, 3, 4, 5)``),
    ``conv_layer_activation`` (default ``'relu'``),
    ``l1_regularization`` and ``l2_regularization`` (default ``None``).
    '''
    def __init__(self, weights=None, **kwargs):
        self.supports_masking = True

        # This is the output dim for each convolutional layer, which is the same as the number of
        # "filters" learned by that layer.
        self.num_filters = kwargs.pop('num_filters')

        # This specifies both the number of convolutional layers we will create and their sizes.
        # Must be a List[int].  The default of (2, 3, 4, 5) will have four convolutional layers,
        # corresponding to encoding ngrams of size 2 to 5 with some number of filters.
        self.ngram_filter_sizes = kwargs.pop('ngram_filter_sizes', (2, 3, 4, 5))

        self.output_dim = kwargs.pop('output_dim')
        self.conv_layer_activation = kwargs.pop('conv_layer_activation', 'relu')

        self.l1_regularization = kwargs.pop("l1_regularization", None)
        self.l2_regularization = kwargs.pop("l2_regularization", None)
        # Factory producing a fresh regularizer per weight.  An unset (None) strength is
        # mapped to 0.0 so the regularizer always receives a numeric value.
        self.regularizer = lambda: l1l2(l1=self.l1_regularization or 0.0,
                                        l2=self.l2_regularization or 0.0)

        # These are member variables that will be defined during self.build().
        self.convolution_layers = None
        self.max_pooling_layers = None
        self.projection_layer = None

        self.input_spec = [InputSpec(ndim=3)]
        self.initial_weights = weights
        super(CNNEncoder, self).__init__(**kwargs)

    def build(self, input_shape):
        '''Create and build one convolution/max-pooling pair per ngram size and the projection.'''
        input_length = input_shape[1]  # number of words

        # We define convolution, maxpooling and dense layers first.
        self.convolution_layers = [
            Convolution1D(nb_filter=self.num_filters,
                          filter_length=ngram_size,
                          activation=self.conv_layer_activation,
                          W_regularizer=self.regularizer(),
                          b_regularizer=self.regularizer())
            for ngram_size in self.ngram_filter_sizes
        ]
        # Pooling over the full convolution output length leaves one vector per filter size.
        self.max_pooling_layers = [
            MaxPooling1D(pool_length=input_length - ngram_size + 1)
            for ngram_size in self.ngram_filter_sizes
        ]
        self.projection_layer = Dense(self.output_dim)

        # Building all layers because these sub-layers are not explicitly part of the
        # computational graph.
        for convolution_layer, max_pooling_layer in zip(
                self.convolution_layers, self.max_pooling_layers):
            convolution_layer.build(input_shape)
            max_pooling_layer.build(
                convolution_layer.get_output_shape_for(input_shape))
        maxpool_output_dim = self.num_filters * len(self.ngram_filter_sizes)
        projection_input_shape = (input_shape[0], maxpool_output_dim)
        self.projection_layer.build(projection_input_shape)

        # Defining the weights of this "layer" as the set of weights from all convolution,
        # maxpooling and projection layers.
        self.trainable_weights = []
        for layer in self.convolution_layers + self.max_pooling_layers + [
                self.projection_layer
        ]:
            self.trainable_weights.extend(layer.trainable_weights)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

        super(CNNEncoder, self).build(input_shape)

    def call(self, x, mask=None):
        '''Encode ``x`` into a tensor of shape (samples, output_dim).'''
        # Each convolution layer returns output of size (samples, pool_length, num_filters),
        #       where pool_length = num_words - ngram_size + 1
        # Each maxpooling layer returns output of size (samples, 1, num_filters).
        # We need to flatten to remove the second dimension of length 1 from the maxpooled output.
        filter_outputs = [
            K.batch_flatten(
                max_pooling_layer.call(convolution_layer.call(x, mask)))
            for max_pooling_layer, convolution_layer in zip(
                self.max_pooling_layers, self.convolution_layers)
        ]
        # A single filter size needs no concatenation.
        if len(filter_outputs) > 1:
            maxpool_output = merge(filter_outputs, mode='concat')
        else:
            maxpool_output = filter_outputs[0]
        return self.projection_layer.call(maxpool_output)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], self.output_dim)

    def compute_mask(self, input, input_mask=None):  # pylint: disable=redefined-builtin
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    def get_config(self):
        '''Return this layer's settings; entries of the base config take precedence.'''
        config = {
            "output_dim": self.output_dim,
            "num_filters": self.num_filters,
            "ngram_filter_sizes": self.ngram_filter_sizes,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization,
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
class AdaptiveInstanceLayerNormalization(Layer):
    """Blend of instance and layer normalization with input-dependent scale and shift.

    The input is normalized twice, with moments over axes ``[1, 2]`` and over
    axes ``[1, 2, 3]``.  The two results are mixed per channel by the learned
    weight ``W``, then scaled and shifted by ``gamma`` and ``beta``, which are
    predicted from the flattened input by two dense layers.

    # Arguments
        smoothing: If true, the mixing weight is shifted down by 0.1 and
            clipped to ``[0, 1]`` before use.
        light: Stored on the layer; not read by the current implementation.
    """
    def __init__(self, smoothing=True, light=False, **kwargs):
        super(AdaptiveInstanceLayerNormalization, self).__init__(**kwargs)
        self.smoothing = smoothing
        self.light = light

    def build(self, input_shape):
        # One mixing weight per channel (last axis), initialised to one.
        self.W = self.add_weight(shape=[input_shape[-1]],
                                 initializer=Ones(),
                                 name='weights',
                                 constraint=MinMaxNorm())
        self.latent_size = input_shape[-1]
        # TODO: (local)Conv2D with high stride before dense? This is way too
        # inefficient, no wonder UGATIT is 2G
        # The dense layers consume the flattened input, hence the product of
        # all non-batch dimensions.
        input_prod = np.prod(input_shape[1:])
        self.fc_gamma = Dense(input_shape[-1])
        self.fc_gamma.build((None, input_prod))
        self.fc_beta = Dense(input_shape[-1])
        self.fc_beta.build((None, input_prod))
        self.flatten = Flatten()
        self.flatten.build(input_shape)
        # The dense layers are built by hand, so their weights are
        # registered on this layer explicitly.
        self.trainable_weights.extend(self.fc_beta.trainable_weights)
        self.trainable_weights.extend(self.fc_gamma.trainable_weights)
        self.built = True

    def call(self, inputs):
        x = inputs
        # Note: Original had 2 fc before this
        # Both predictors read the same flattened input, so flatten once.
        flat = self.flatten(x)
        gamma = K.reshape(self.fc_gamma(flat), (-1, 1, 1, self.latent_size))
        beta = K.reshape(self.fc_beta(flat), (-1, 1, 1, self.latent_size))

        eps = 1e-5
        # tf.nn.moments returns (mean, variance).
        ins_mean, ins_var = tf.nn.moments(x, axes=[1, 2], keep_dims=True)
        x_ins = (x - ins_mean) / K.sqrt(ins_var + eps)
        ln_mean, ln_var = tf.nn.moments(x, axes=[1, 2, 3], keep_dims=True)
        x_ln = (x - ln_mean) / K.sqrt(ln_var + eps)

        rho = self.W
        if self.smoothing:
            rho = K.clip(rho - K.constant(0.1), 0.0, 1.0)
        x_hat = rho * x_ins + (1 - rho) * x_ln
        x_hat = x_hat * gamma + beta
        return x_hat

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        config = {'smoothing': self.smoothing, 'light': self.light}
        base_config = super(AdaptiveInstanceLayerNormalization,
                            self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
class CNNEncoder(MaskedLayer):
    '''
    CNNEncoder is a combination of multiple convolution layers and max pooling layers. This is
    defined as a single layer to be consistent with the other encoders in terms of input and output
    specifications.  The input to this "layer" is of shape (batch_size, num_words, embedding_dim)
    and the output is of size (batch_size, output_dim).

    The CNN has one convolution layer per each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be used depends
    on the ngram size: input_length - ngram_size + 1. The corresponding maxpooling layer aggregates
    all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project it back to the desired output_dim.  For more
    details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.

    Parameters
    ----------
    units: int
        After doing convolutions, we'll project the collected features into a vector of this size.
        This used to be ``output_dim``, but Keras changed it to ``units``.  I prefer the name
        ``output_dim``, so we'll leave the code using ``output_dim``, and just use the name
        ``units`` in the external API.
    num_filters: int
        This is the output dim for each convolutional layer, which is the same as the number of
        "filters" learned by that layer.
    ngram_filter_sizes: Tuple[int], optional (default=(2, 3, 4, 5))
        This specifies both the number of convolutional layers we will create and their sizes.  The
        default of (2, 3, 4, 5) will have four convolutional layers, corresponding to encoding
        ngrams of size 2 to 5 with some number of filters.
    conv_layer_activation: str, optional (default='relu')
    l1_regularization: float, optional (default=None)
    l2_regularization: float, optional (default=None)
    '''
    def __init__(self,
                 units: int,
                 num_filters: int,
                 ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5),
                 conv_layer_activation: str = 'relu',
                 l1_regularization: float = None,
                 l2_regularization: float = None,
                 **kwargs):
        self.num_filters = num_filters
        self.ngram_filter_sizes = ngram_filter_sizes
        self.output_dim = units
        self.conv_layer_activation = conv_layer_activation
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization
        # Factory producing a fresh regularizer per weight.  An unset (None) strength is
        # mapped to 0.0 so the regularizer always receives a numeric value.
        self.regularizer = lambda: l1_l2(l1=self.l1_regularization or 0.0,
                                         l2=self.l2_regularization or 0.0)

        # These are member variables that will be defined during self.build().
        self.convolution_layers = None
        self.max_pooling_layers = None
        self.projection_layer = None

        self.input_spec = [InputSpec(ndim=3)]
        super(CNNEncoder, self).__init__(**kwargs)

    @overrides
    def build(self, input_shape):
        input_length = input_shape[1]  # number of words

        # We define convolution, maxpooling and dense layers first.
        self.convolution_layers = [
            Convolution1D(filters=self.num_filters,
                          kernel_size=ngram_size,
                          activation=self.conv_layer_activation,
                          kernel_regularizer=self.regularizer(),
                          bias_regularizer=self.regularizer())
            for ngram_size in self.ngram_filter_sizes
        ]
        # Pooling over the full convolution output length leaves one vector per filter size.
        # `pool_size` is the Keras 2 name of the argument formerly called `pool_length`.
        self.max_pooling_layers = [
            MaxPooling1D(pool_size=input_length - ngram_size + 1)
            for ngram_size in self.ngram_filter_sizes
        ]
        self.projection_layer = Dense(self.output_dim)

        # Building all layers because these sub-layers are not explicitly part of the
        # computational graph.
        for convolution_layer, max_pooling_layer in zip(
                self.convolution_layers, self.max_pooling_layers):
            with K.name_scope(convolution_layer.name):
                convolution_layer.build(input_shape)
            with K.name_scope(max_pooling_layer.name):
                max_pooling_layer.build(
                    convolution_layer.compute_output_shape(input_shape))
        maxpool_output_dim = self.num_filters * len(self.ngram_filter_sizes)
        projection_input_shape = (input_shape[0], maxpool_output_dim)
        with K.name_scope(self.projection_layer.name):
            self.projection_layer.build(projection_input_shape)

        # Defining the weights of this "layer" as the set of weights from all convolution,
        # maxpooling and projection layers.
        self.trainable_weights = []
        for layer in self.convolution_layers + self.max_pooling_layers + [
                self.projection_layer
        ]:
            self.trainable_weights.extend(layer.trainable_weights)

        super(CNNEncoder, self).build(input_shape)

    @overrides
    def call(self, inputs, mask=None):  # pylint: disable=unused-argument
        # Each convolution layer returns output of size (samples, pool_length, num_filters),
        #       where pool_length = num_words - ngram_size + 1
        # Each maxpooling layer returns output of size (samples, 1, num_filters).
        # We need to flatten to remove the second dimension of length 1 from the maxpooled output.
        # TODO(matt): we need to use a convolutional layer here that supports masking.
        filter_outputs = [
            K.batch_flatten(
                max_pooling_layer.call(convolution_layer.call(inputs)))
            for max_pooling_layer, convolution_layer in zip(
                self.max_pooling_layers, self.convolution_layers)
        ]
        if K.backend() == 'theano':
            # Just using the `call` method on layers does not set the _keras_shape, which is
            # necessary with the theano backend.  So we set it manually here to what we expect the
            # shape to be.
            for filter_output in filter_outputs:
                filter_output._keras_shape = (None, self.num_filters)  # pylint: disable=protected-access
        # A single filter size needs no concatenation.
        if len(filter_outputs) > 1:
            maxpool_output = Concatenate()(filter_outputs)
        else:
            maxpool_output = filter_outputs[0]
        return self.projection_layer.call(maxpool_output)

    @overrides
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

    @overrides
    def compute_mask(self, inputs, mask=None):
        # pylint: disable=unused-argument
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    @overrides
    def get_config(self):
        # Entries of the base config take precedence over the ones defined here.
        config = {
            "units": self.output_dim,
            "num_filters": self.num_filters,
            "ngram_filter_sizes": self.ngram_filter_sizes,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization,
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config