def get_split_averages(input_tensor, input_mask, indices):
    # Splits input tensor into three parts based on the indices and
    # returns average of values prior to index, values at the index and
    # average of values after the index.
    # input_tensor: (batch_size, input_length, input_dim)
    # input_mask: (batch_size, input_length)
    # indices: (batch_size, 1)
    # (1, input_length)
    length_range = K.expand_dims(K.arange(K.shape(input_tensor)[1]), dim=0)
    # (batch_size, input_length)
    batched_range = K.repeat_elements(length_range, K.shape(input_tensor)[0], 0)
    tiled_indices = K.repeat_elements(indices, K.shape(input_tensor)[1], 1)  # (batch_size, input_length)
    greater_mask = K.greater(batched_range, tiled_indices)  # (batch_size, input_length)
    lesser_mask = K.lesser(batched_range, tiled_indices)  # (batch_size, input_length)
    equal_mask = K.equal(batched_range, tiled_indices)  # (batch_size, input_length)

    # We also need to mask these masks using the input mask.
    # (batch_size, input_length)
    if input_mask is not None:
        greater_mask = switch(input_mask, greater_mask, K.zeros_like(greater_mask))
        lesser_mask = switch(input_mask, lesser_mask, K.zeros_like(lesser_mask))

    post_sum = K.sum(switch(K.expand_dims(greater_mask), input_tensor, K.zeros_like(input_tensor)), axis=1)  # (batch_size, input_dim)
    pre_sum = K.sum(switch(K.expand_dims(lesser_mask), input_tensor, K.zeros_like(input_tensor)), axis=1)  # (batch_size, input_dim)
    values_at_indices = K.sum(switch(K.expand_dims(equal_mask), input_tensor, K.zeros_like(input_tensor)), axis=1)  # (batch_size, input_dim)

    post_normalizer = K.expand_dims(K.sum(greater_mask, axis=1) + K.epsilon(), dim=1)  # (batch_size, 1)
    pre_normalizer = K.expand_dims(K.sum(lesser_mask, axis=1) + K.epsilon(), dim=1)  # (batch_size, 1)

    return K.cast(pre_sum / pre_normalizer, 'float32'), values_at_indices, K.cast(post_sum / post_normalizer, 'float32')
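# To make the split semantics concrete, here is a minimal NumPy sketch of the same
# computation (a reference illustration only; split_averages_reference and the toy
# tensors below are hypothetical and not part of the layer code).
import numpy as np

def split_averages_reference(input_tensor, input_mask, indices):
    # Average of values before the index, value at the index, and average of
    # values after the index, ignoring masked-out (padded) positions.
    _, input_length, _ = input_tensor.shape
    positions = np.arange(input_length)[None, :]                  # (1, input_length)
    valid = input_mask.astype(bool)                               # (batch_size, input_length)
    lesser = (positions < indices) & valid
    greater = (positions > indices) & valid
    equal = positions == indices
    pre_avg = (input_tensor * lesser[..., None]).sum(1) / (lesser.sum(1, keepdims=True) + 1e-8)
    post_avg = (input_tensor * greater[..., None]).sum(1) / (greater.sum(1, keepdims=True) + 1e-8)
    at_index = (input_tensor * equal[..., None]).sum(1)
    return pre_avg, at_index, post_avg

x = np.arange(12, dtype='float32').reshape(1, 4, 3)   # one sample, length 4, dim 3
mask = np.array([[1, 1, 1, 0]])                       # last timestep is padding
idx = np.array([[1]])
pre, at, post = split_averages_reference(x, mask, idx)
# pre == x[0, 0], at == x[0, 1], post == x[0, 2] (x[0, 3] is masked out)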
def call(self, x, mask=None):
    mean = super(IntraAttention, self).call(x, mask)
    # x: (batch_size, input_length, input_dim)
    # mean: (batch_size, input_dim)
    ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)), dim=0)  # (1, input_length)
    # (batch_size, input_length, input_dim)
    tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones), (0, 2, 1))
    if mask is not None:
        if K.ndim(mask) > K.ndim(x):
            # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
            # TODO: Fix Bidirectional.
            mask = K.any(mask, axis=(-2, -1))
        if K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
        x = switch(mask, x, K.zeros_like(x))
    # (batch_size, input_length, proj_dim)
    projected_combination = K.tanh(K.dot(x, self.vector_projector) + K.dot(tiled_mean, self.mean_projector))
    scores = K.dot(projected_combination, self.scorer)  # (batch_size, input_length)
    weights = K.softmax(scores)  # (batch_size, input_length)
    attended_x = K.sum(K.expand_dims(weights) * x, axis=1)  # (batch_size, input_dim)
    return attended_x
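# A rough NumPy sketch of what this intra attention computes: each timestep is
# combined with the mean sentence representation, scored, and the scores are
# softmaxed into weights for a weighted sum. Shapes and projector values below
# are hypothetical and purely illustrative.
import numpy as np

def softmax(v):
    e = np.exp(v - v.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

batch_size, input_length, input_dim, proj_dim = 2, 5, 4, 3
x = np.random.rand(batch_size, input_length, input_dim).astype('float32')
mean = x.mean(axis=1)                                  # stand-in for the superclass's masked mean
vector_projector = np.random.rand(input_dim, proj_dim).astype('float32')
mean_projector = np.random.rand(input_dim, proj_dim).astype('float32')
scorer = np.random.rand(proj_dim).astype('float32')

projected = np.tanh(x @ vector_projector + (mean @ mean_projector)[:, None, :])  # (2, 5, 3)
weights = softmax(projected @ scorer)                  # (2, 5), one weight per timestep
attended_x = (weights[..., None] * x).sum(axis=1)      # (2, 4)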
def call(self, x, mask=None):
    # x: (batch_size, input_length, input_dim)
    if mask is None:
        return K.mean(x, axis=1)  # (batch_size, input_dim)
    else:
        # This is to remove padding from the computational graph.
        if K.ndim(mask) > K.ndim(x):
            # This is due to the bug in Bidirectional that is passing the input mask
            # instead of computing the output mask.
            # TODO: Fix the implementation of Bidirectional.
            mask = K.any(mask, axis=(-2, -1))
        if K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
        masked_input = switch(mask, x, K.zeros_like(x))
        # Normalize per sample so the weights over the time axis sum to 1.
        float_mask = K.cast(mask, 'float32')
        weights = float_mask / (K.sum(float_mask, axis=1, keepdims=True) + K.epsilon())
        return K.sum(masked_input * weights, axis=1)  # (batch_size, input_dim)
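# A tiny NumPy illustration of this masked mean (toy values, hypothetical): the
# padded timestep contributes nothing and the remaining timesteps are averaged.
import numpy as np

x = np.array([[[1., 1.], [2., 2.], [3., 3.], [99., 99.]]])   # (1, 4, 2), last step is padding
mask = np.array([[1., 1., 1., 0.]])[..., None]               # (1, 4, 1), matches the expand_dims above

masked_x = x * mask
weights = mask / (mask.sum(axis=1, keepdims=True) + 1e-8)    # each valid timestep weighted 1/3
masked_mean = (masked_x * weights).sum(axis=1)               # [[2., 2.]], padding ignored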
def call(self, x, mask=None):
    # x: (batch_size, input_length, input_dim) where input_length = head_size + 2
    head_encoding = x[:, :-2, :]  # (batch_size, head_size, input_dim)
    prep_encoding = x[:, -2, :]  # (batch_size, input_dim)
    child_encoding = x[:, -1, :]  # (batch_size, input_dim)
    if self.composition_type == 'HPCD':
        # TODO: The following line may not work with TF.
        # (batch_size, head_size, input_dim, 1) * (1, head_size, input_dim, proj_dim)
        head_proj_prod = K.expand_dims(head_encoding) * K.expand_dims(self.dist_proj_head, dim=0)
        head_projection = K.sum(head_proj_prod, axis=2)  # (batch_size, head_size, proj_dim)
    else:
        head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, head_size, proj_dim)
    prep_projection = K.expand_dims(K.dot(prep_encoding, self.proj_prep), dim=1)  # (batch_size, 1, proj_dim)
    child_projection = K.expand_dims(K.dot(child_encoding, self.proj_child), dim=1)  # (batch_size, 1, proj_dim)
    # (batch_size, head_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC' or self.composition_type == 'HPCD':
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, 1, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type is HC.
        composed_projection = K.tanh(head_projection + child_projection)
    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, head_size, proj_dim)
    # (batch_size, head_size)
    head_word_scores = K.squeeze(K.dot(composed_projection, self.scorer), axis=-1)
    if mask is None:
        attachment_probabilities = K.softmax(head_word_scores)  # (batch_size, head_size)
    else:
        if K.ndim(mask) > 2:
            # This means this layer came after a Bidirectional layer. Keras has a bug where
            # Bidirectional concatenates the input masks instead of computing an output mask.
            # TODO: Fix Bidirectional instead.
            mask = K.any(mask, axis=(-2, -1))
        # We need to do a masked softmax.
        exp_scores = K.exp(head_word_scores)  # (batch_size, head_size)
        head_mask = mask[:, :-2]  # (batch_size, head_size)
        # (batch_size, head_size)
        masked_exp_scores = switch(head_mask, exp_scores, K.zeros_like(head_encoding[:, :, 0]))
        # (batch_size, 1). Adding epsilon to avoid division by 0. But epsilon is float64.
        exp_sum = K.cast(K.expand_dims(K.sum(masked_exp_scores, axis=1) + K.epsilon()), 'float32')
        attachment_probabilities = masked_exp_scores / exp_sum  # (batch_size, head_size)
    return attachment_probabilities
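# The masked softmax in the mask branch above can be illustrated with a small
# NumPy sketch (masked_softmax and the toy scores are hypothetical, not part of
# the layer): exponentiate, zero out padded head words, then renormalize.
import numpy as np

def masked_softmax(scores, mask, epsilon=1e-8):
    exp_scores = np.exp(scores) * mask
    return exp_scores / (exp_scores.sum(axis=1, keepdims=True) + epsilon)

head_word_scores = np.array([[2.0, 1.0, 5.0]])   # batch 1, head_size 3
head_mask = np.array([[1.0, 1.0, 0.0]])          # last head word is padding
print(masked_softmax(head_word_scores, head_mask))
# [[0.731, 0.269, 0.0]] (approximately); the padded head word gets zero probability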
def _step(self, x_onto_aware, states):
    h_tm1 = states[0]
    mask_i = states[-1]  # (samples, senses, hyps, 1)
    lstm_states = states[:-1]
    # Before the step function is called, the original input is dimshuffled to have
    # (time, samples, senses, hyps, concept_dim). So the shape of x_onto_aware is
    # (samples, senses, hyps, concept_dim + 1), +1 for the sense prior parameter.
    # TODO: Use sense priors even when not using attention?
    x_synset_embeddings = x_onto_aware[:, :, :, :-1]  # (samples, senses, hyps, embedding_dim)

    # Sense probability calculation
    # Taking only the last dimension from all samples. These are the lambda values of exp distributions.
    sense_parameters = K.expand_dims(x_onto_aware[:, 0, 0, -1])  # (samples, 1)
    # (1, num_senses)
    sense_indices = K.variable(K.cast_to_floatx([[ind for ind in range(self.num_senses)]]))
    # (samples, num_senses)
    expanded_sense_indices = K.dot(K.ones_like(sense_parameters), sense_indices)
    # Getting the sense probabilities from the exponential distribution. p(x) = \lambda * e^(-\lambda * x)
    sense_scores = sense_parameters * K.exp(-sense_parameters * expanded_sense_indices)  # (samples, num_senses)
    # If sense priors were not set by the embedding layer, the sense_parameters will be zero, making
    # the sense scores zero. What we really need is uniform sense scores.
    uniform_scores = K.ones_like(sense_scores) * (1. / self.num_senses)
    sense_scores = switch(K.equal(sense_scores, K.zeros_like(sense_scores)), uniform_scores, sense_scores)
    if mask_i is not None:
        sense_mask = K.any(K.squeeze(mask_i, axis=-1), axis=2)  # (samples, senses)
        sense_scores = switch(sense_mask, sense_scores, K.zeros_like(sense_scores))
    # Renormalizing sense scores to make \sum_{num_senses} p(sense | word) = 1
    sense_probabilities = sense_scores / K.expand_dims(K.sum(sense_scores, axis=1) + K.epsilon())  # (samples, num_senses)

    if self.use_attention:
        # Generalization attention
        input_hyp_projection = K.dot(x_synset_embeddings, self.input_hyp_projector)  # (samples, senses, hyps, output_dim)
        context_hyp_projection = K.dot(h_tm1, self.context_hyp_projector)  # (samples, output_dim)
        # (samples, 1, 1, output_dim)
        context_hyp_projection_expanded = K.expand_dims(K.expand_dims(context_hyp_projection, dim=1), dim=1)
        hyp_projection1 = K.tanh(input_hyp_projection + context_hyp_projection_expanded)  # (samples, senses, hyps, output_dim)
        hyp_projection2 = K.tanh(K.dot(hyp_projection1, self.hyp_projector2))  # (samples, senses, hyps, output_dim)
        # K.dot doesn't work with tensorflow when one of the arguments is a vector. So expanding and squeezing.
        # (samples, senses, hyps)
        hyp_scores = K.squeeze(K.dot(hyp_projection2, K.expand_dims(self.hyp_scorer)), axis=-1)
        if mask_i is not None:
            hyp_scores = switch(K.squeeze(mask_i, axis=-1), hyp_scores, K.zeros_like(hyp_scores))
        scores_shape = K.shape(hyp_scores)
        # We need to flatten this because we cannot perform softmax on tensors.
        flattened_scores = K.batch_flatten(hyp_scores)  # (samples, senses*hyps)
        hyp_attention = K.reshape(K.softmax(flattened_scores), scores_shape)  # (samples, senses, hyps)
    else:
        # Matrix of ones for the scores to be consistent. (samples, senses, hyps)
        hyp_attention = K.ones_like(x_synset_embeddings)[:, :, :, 0]
    if mask_i is not None:
        hyp_attention = switch(K.squeeze(mask_i, axis=-1), hyp_attention, K.zeros_like(hyp_attention))
    # Renormalizing hyp attention to get p(hyp | sense, word). Summing over hyps.
    hyp_given_sense_attention = hyp_attention / K.expand_dims(K.sum(hyp_attention, axis=2) + K.epsilon())
    # Multiply p(hyp | sense, word) and p(sense | word). Attention values now sum to 1.
    sense_hyp_attention = hyp_given_sense_attention * K.expand_dims(sense_probabilities)
    if mask_i is not None:
        # Applying the mask on the input
        zeros_like_input = K.zeros_like(x_synset_embeddings)  # (samples, senses, hyps, dim)
        x_synset_embeddings = switch(mask_i, x_synset_embeddings, zeros_like_input)
    weighted_product = x_synset_embeddings * K.expand_dims(sense_hyp_attention)  # (samples, senses, hyps, input_dim)
    # Weighted average, summing over senses and hyps
    lstm_input_t = K.sum(weighted_product, axis=(1, 2))  # (samples, input_dim)
    # Now pass the computed lstm_input to the LSTM's step function to get the current h and c.
    h, [_, c] = super(OntoAttentionLSTM, self).step(lstm_input_t, lstm_states)
    return h, c, sense_hyp_attention
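# A per-word NumPy illustration (hypothetical toy numbers, no batch or time dims) of the
# attention factorization used above: p(sense, hyp | word) = p(hyp | sense, word) * p(sense | word),
# with the sense prior coming from an exponential distribution over sense ranks.
import numpy as np

num_senses, num_hyps = 3, 4
lam = 0.5                                                    # sense prior parameter (lambda)

sense_scores = lam * np.exp(-lam * np.arange(num_senses))    # p(x) = lambda * e^(-lambda * x)
sense_probabilities = sense_scores / sense_scores.sum()      # (num_senses,), sums to 1

hyp_scores = np.random.rand(num_senses, num_hyps)            # stand-in for the attention scores
hyp_given_sense = hyp_scores / hyp_scores.sum(axis=1, keepdims=True)

sense_hyp_attention = hyp_given_sense * sense_probabilities[:, None]
assert np.isclose(sense_hyp_attention.sum(), 1.0)            # joint attention sums to 1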