# Assumed import for these examples; `switch(condition, then_tensor, else_tensor)` is a
# backend-agnostic helper defined elsewhere in the same codebase.
from keras import backend as K

def get_split_averages(input_tensor, input_mask, indices):
        # Splits the input tensor into three parts based on the indices, and returns the
        # average of the values before the index, the values at the index, and the
        # average of the values after the index.
        # input_tensor: (batch_size, input_length, input_dim)
        # input_mask: (batch_size, input_length)
        # indices: (batch_size, 1)
        # (1, input_length)
        length_range = K.expand_dims(K.arange(K.shape(input_tensor)[1]), dim=0)
        # (batch_size, input_length)
        batched_range = K.repeat_elements(length_range, K.shape(input_tensor)[0], 0)
        tiled_indices = K.repeat_elements(indices, K.shape(input_tensor)[1], 1)  # (batch_size, input_length)
        greater_mask = K.greater(batched_range, tiled_indices)  # (batch_size, input_length)
        lesser_mask = K.lesser(batched_range, tiled_indices)  # (batch_size, input_length)
        equal_mask = K.equal(batched_range, tiled_indices)  # (batch_size, input_length)

        # We also need to mask these masks using the input mask.
        # (batch_size, input_length)
        if input_mask is not None:
            greater_mask = switch(input_mask, greater_mask, K.zeros_like(greater_mask))
            lesser_mask = switch(input_mask, lesser_mask, K.zeros_like(lesser_mask))

        post_sum = K.sum(switch(K.expand_dims(greater_mask), input_tensor, K.zeros_like(input_tensor)), axis=1)  # (batch_size, input_dim)
        pre_sum = K.sum(switch(K.expand_dims(lesser_mask), input_tensor, K.zeros_like(input_tensor)), axis=1)  # (batch_size, input_dim)
        values_at_indices = K.sum(switch(K.expand_dims(equal_mask), input_tensor, K.zeros_like(input_tensor)), axis=1)  # (batch_size, input_dim)

        post_normalizer = K.expand_dims(K.sum(greater_mask, axis=1) + K.epsilon(), dim=1)  # (batch_size, 1)
        pre_normalizer = K.expand_dims(K.sum(lesser_mask, axis=1) + K.epsilon(), dim=1)  # (batch_size, 1)

        return K.cast(pre_sum / pre_normalizer, 'float32'), values_at_indices, K.cast(post_sum / post_normalizer, 'float32')
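For intuition, the following NumPy sketch (not part of the original code; the function name split_averages_numpy and the toy inputs are illustrative) performs the same split-and-average computation that get_split_averages expresses symbolically with Keras tensors and switch:

import numpy as np

def split_averages_numpy(x, mask, indices):
    # x: (batch_size, input_length, input_dim), mask: (batch_size, input_length),
    # indices: (batch_size, 1). Plain NumPy version of the same split-and-average logic.
    positions = np.arange(x.shape[1])[None, :]                   # (1, input_length)
    valid = mask.astype(bool)
    lesser = (positions < indices) & valid                       # (batch_size, input_length)
    greater = (positions > indices) & valid                      # (batch_size, input_length)
    equal = positions == indices                                 # (batch_size, input_length)
    pre = (x * lesser[:, :, None]).sum(1) / (lesser.sum(1, keepdims=True) + 1e-8)
    post = (x * greater[:, :, None]).sum(1) / (greater.sum(1, keepdims=True) + 1e-8)
    at_index = (x * equal[:, :, None]).sum(1)                    # value at the indexed timestep
    return pre, at_index, post

x = np.arange(24, dtype=float).reshape(2, 4, 3)
mask = np.array([[1, 1, 1, 0], [1, 1, 1, 1]])
indices = np.array([[1], [2]])
pre, at_index, post = split_averages_numpy(x, mask, indices)     # each is (2, 3)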
Example #2
 def call(self, x, mask=None):
     mean = super(IntraAttention, self).call(x, mask)
     # x: (batch_size, input_length, input_dim)
     # mean: (batch_size, input_dim)
     ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)),
                          dim=0)  # (1, input_length)
     # (batch_size, input_length, input_dim)
     tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones),
                                       (0, 2, 1))
     if mask is not None:
         if K.ndim(mask) > K.ndim(x):
             # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
             # TODO: Fix Bidirectional.
             mask = K.any(mask, axis=(-2, -1))
         if K.ndim(mask) < K.ndim(x):
             mask = K.expand_dims(mask)
         x = switch(mask, x, K.zeros_like(x))
     # (batch_size, input_length, proj_dim)
     projected_combination = K.tanh(
         K.dot(x, self.vector_projector) +
         K.dot(tiled_mean, self.mean_projector))
     scores = K.dot(projected_combination,
                    self.scorer)  # (batch_size, input_length)
     weights = K.softmax(scores)  # (batch_size, input_length)
     attended_x = K.sum(K.expand_dims(weights) * x,
                        axis=1)  # (batch_size, input_dim)
     return attended_x
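The IntraAttention example above scores each timestep with a scorer applied to tanh projections of the timestep and the mean, softmaxes the scores, and returns the weighted sum. Below is a minimal NumPy sketch of the unmasked path, with random arrays standing in for the learned weights vector_projector, mean_projector, and scorer (the sketch itself is not from the original code):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def intra_attention_numpy(x, vector_projector, mean_projector, scorer):
    # x: (batch_size, input_length, input_dim); mirrors the unmasked path of IntraAttention.call.
    mean = x.mean(axis=1)                                          # (batch_size, input_dim)
    tiled_mean = np.repeat(mean[:, None, :], x.shape[1], axis=1)   # (batch_size, input_length, input_dim)
    projected = np.tanh(x @ vector_projector + tiled_mean @ mean_projector)  # (batch_size, input_length, proj_dim)
    scores = projected @ scorer                                    # (batch_size, input_length)
    weights = softmax(scores)
    return (weights[:, :, None] * x).sum(axis=1)                   # (batch_size, input_dim)

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 5, 4))
attended = intra_attention_numpy(x, rng.standard_normal((4, 3)), rng.standard_normal((4, 3)), rng.standard_normal(3))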
Example #3
 def call(self, x, mask=None):
     # x: (batch_size, input_length, input_dim)
     if mask is None:
         return K.mean(x, axis=1)  # (batch_size, input_dim)
     else:
         # This is to remove padding from the computational graph.
         if K.ndim(mask) > K.ndim(x):
             # This is due to the bug in Bidirectional that is passing the input mask
             # instead of computing output mask.
             # TODO: Fix the implementation of Bidirectional.
             mask = K.any(mask, axis=(-2, -1))
         if K.ndim(mask) < K.ndim(x):
             mask = K.expand_dims(mask)
         masked_input = switch(mask, x, K.zeros_like(x))
         float_mask = K.cast(mask, 'float32')
         # Normalize per sample: the unmasked timesteps of each sample share a total weight of 1.
         weights = float_mask / (K.sum(float_mask, axis=1, keepdims=True) + K.epsilon())
         return K.sum(masked_input * weights, axis=1)  # (batch_size, input_dim)
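The masked branch above is a per-sample average over the unmasked timesteps. A small NumPy equivalent, assuming the per-sample normalization described in the comment:

import numpy as np

def masked_mean(x, mask):
    # x: (batch_size, input_length, input_dim), mask: (batch_size, input_length) of 0/1.
    m = mask.astype(float)[:, :, None]                       # (batch_size, input_length, 1)
    weights = m / (m.sum(axis=1, keepdims=True) + 1e-8)      # unmasked steps share weight 1 per sample
    return (x * weights).sum(axis=1)                         # (batch_size, input_dim)

x = np.arange(12, dtype=float).reshape(2, 3, 2)
mask = np.array([[1, 1, 0], [1, 1, 1]])
# The first sample averages only its first two timesteps; the second averages all three.
print(masked_mean(x, mask))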
Example #4
 def call(self, x, mask=None):
     # x: (batch_size, input_length, input_dim)
     if mask is None:
         return K.mean(x, axis=1)  # (batch_size, input_dim)
     else:
         # This is to remove padding from the computational graph.
         if K.ndim(mask) > K.ndim(x):
             # This is due to the bug in Bidirectional that is passing the input mask
             # instead of computing output mask.
             # TODO: Fix the implementation of Bidirectional.
             mask = K.any(mask, axis=(-2, -1))
         if K.ndim(mask) < K.ndim(x):
             mask = K.expand_dims(mask)
         masked_input = switch(mask, x, K.zeros_like(x))
         float_mask = K.cast(mask, 'float32')
         # Normalize per sample: the unmasked timesteps of each sample share a total weight of 1.
         weights = float_mask / (K.sum(float_mask, axis=1, keepdims=True) + K.epsilon())
         return K.sum(masked_input * weights,
                      axis=1)  # (batch_size, input_dim)
Example #5
 def call(self, x, mask=None):
     # x: (batch_size, input_length, input_dim) where input_length = head_size + 2
     head_encoding = x[:, :-2, :]  # (batch_size, head_size, input_dim)
     prep_encoding = x[:, -2, :]  # (batch_size, input_dim)
     child_encoding = x[:, -1, :]  # (batch_size, input_dim)
     if self.composition_type == 'HPCD':
         # TODO: The following line may not work with TF.
         # (batch_size, head_size, input_dim, 1) * (1, head_size, input_dim, proj_dim)
         head_proj_prod = K.expand_dims(head_encoding) * K.expand_dims(self.dist_proj_head, dim=0)
         head_projection = K.sum(head_proj_prod, axis=2)  # (batch_size, head_size, proj_dim)
     else:
         head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, head_size, proj_dim)
     prep_projection = K.expand_dims(K.dot(prep_encoding, self.proj_prep), dim=1)  # (batch_size, 1, proj_dim)
     child_projection = K.expand_dims(K.dot(child_encoding, self.proj_child), dim=1)  # (batch_size, 1, proj_dim)
     # (batch_size, head_size, proj_dim)
     if self.composition_type == 'HPCT':
         composed_projection = K.tanh(head_projection + prep_projection + child_projection)
     elif self.composition_type == 'HPC' or self.composition_type == "HPCD":
         prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, 1, proj_dim)
         composed_projection = K.tanh(head_projection + prep_child_projection)
     else:
         # Composition type is HC.
         composed_projection = K.tanh(head_projection + child_projection)
     for hidden_layer in self.hidden_layers:
         composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, head_size, proj_dim)
     # (batch_size, head_size)
     head_word_scores = K.squeeze(K.dot(composed_projection, self.scorer), axis=-1)
     if mask is None:
         attachment_probabilities = K.softmax(head_word_scores)  # (batch_size, head_size)
     else:
         if K.ndim(mask) > 2:
             # This means this layer came after a Bidirectional layer. Keras has this bug which
             # concatenates input masks instead of output masks.
             # TODO: Fix Bidirectional instead.
             mask = K.any(mask, axis=(-2, -1))
         # We need to do a masked softmax.
         exp_scores = K.exp(head_word_scores)  # (batch_size, head_size)
         head_mask = mask[:, :-2]  # (batch_size, head_size)
         # (batch_size, head_size)
         masked_exp_scores = switch(head_mask, exp_scores, K.zeros_like(head_encoding[:, :, 0]))
         # (batch_size, 1). Adding epsilon to avoid division by zero; epsilon is float64, hence the cast to float32.
         exp_sum = K.cast(K.expand_dims(K.sum(masked_exp_scores, axis=1) + K.epsilon()), 'float32')
         attachment_probabilities = masked_exp_scores / exp_sum  # (batch_size, head_size)
     return attachment_probabilities
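The masked softmax in the else branch exponentiates the scores, zeroes out the masked head positions, and renormalizes per sample. A hedged NumPy sketch of that step (the max-shift is a standard numerical-stability tweak, not part of the original code):

import numpy as np

def masked_softmax(scores, mask):
    # scores, mask: (batch_size, head_size); mask has 0 for padded head words.
    exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True)) * mask
    return exp_scores / (exp_scores.sum(axis=1, keepdims=True) + 1e-8)

scores = np.array([[2.0, 1.0, 0.5], [0.1, 0.2, 0.3]])
mask = np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])
print(masked_softmax(scores, mask))  # rows sum to (almost) 1; masked positions get 0 probability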
Example #6
 def call(self, x, mask=None):
     mean = super(IntraAttention, self).call(x, mask)
     # x: (batch_size, input_length, input_dim)
     # mean: (batch_size, input_dim)
     ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)), dim=0)  # (1, input_length)
     # (batch_size, input_length, input_dim)
     tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones), (0, 2, 1))
     if mask is not None:
         if K.ndim(mask) > K.ndim(x):
             # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
             # TODO: Fix Bidirectional.
             mask = K.any(mask, axis=(-2, -1))
         if K.ndim(mask) < K.ndim(x):
             mask = K.expand_dims(mask)
         x = switch(mask, x, K.zeros_like(x))
     # (batch_size, input_length, proj_dim)
     projected_combination = K.tanh(K.dot(x, self.vector_projector) + K.dot(tiled_mean, self.mean_projector))
     scores = K.dot(projected_combination, self.scorer)  # (batch_size, input_length)
     weights = K.softmax(scores)  # (batch_size, input_length)
     attended_x = K.sum(K.expand_dims(weights) * x, axis=1)  # (batch_size, input_dim)
     return attended_x
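Example #6 repeats the IntraAttention.call from Example #2. The ones/dot/permute trick it uses to tile the mean over the time axis is equivalent to a plain broadcast, as the following NumPy check (illustrative only) shows:

import numpy as np

batch_size, input_length, input_dim = 2, 5, 3
mean = np.arange(batch_size * input_dim, dtype=float).reshape(batch_size, input_dim)
ones = np.ones((1, input_length))

# K.dot(K.expand_dims(mean), ones): (batch_size, input_dim, 1) x (1, input_length)
# -> (batch_size, input_dim, input_length), then permute to (batch_size, input_length, input_dim).
tiled_mean = np.transpose(mean[:, :, None] @ ones, (0, 2, 1))

# The same result with plain broadcasting:
broadcast_mean = np.broadcast_to(mean[:, None, :], (batch_size, input_length, input_dim))
assert np.allclose(tiled_mean, broadcast_mean)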
Example #7
    def _step(self, x_onto_aware, states):
        h_tm1 = states[0]
        mask_i = states[-1]  # (samples, senses, hyps, 1)
        lstm_states = states[:-1]

        # Before the step function is called, the original input is dimshuffled to have (time, samples, senses, hyps, concept_dim)
        # So shape of x_onto_aware is (samples, senses, hyps, concept_dim + 1), +1 for sense prior parameter
        # TODO: Use sense priors even when not using attention?
        x_synset_embeddings = x_onto_aware[:, :, :, :-1]  # (samples, senses, hyps, embedding_dim)

        # Sense probability calculation
        # Taking only the last dimension from all samples. These are the lambda values of the exponential distributions.
        sense_parameters = K.expand_dims(x_onto_aware[:, 0, 0, -1])  # (samples, 1)
        # (1, num_senses)
        sense_indices = K.variable(K.cast_to_floatx([[ind for ind in range(self.num_senses)]]))
        # (samples, num_senses)
        expanded_sense_indices = K.dot(K.ones_like(sense_parameters), sense_indices)
        # Getting the sense probabilities from the exponential distribution. p(x) = \lambda * e^(-\lambda * x)
        sense_scores = sense_parameters * K.exp(-sense_parameters * expanded_sense_indices)  # (samples, num_senses)
        # If the embedding layer did not set sense priors, sense_parameters will be zero, making the
        # sense scores zero as well. In that case, fall back to uniform sense scores.
        uniform_scores = K.ones_like(sense_scores) * (1. / self.num_senses)
        sense_scores = switch(K.equal(sense_scores, K.zeros_like(sense_scores)), uniform_scores, sense_scores)
        if mask_i is not None:
            sense_mask = K.any(K.squeeze(mask_i, axis=-1), axis=2)  # (samples, senses)
            sense_scores = switch(sense_mask, sense_scores, K.zeros_like(sense_scores))
        # Renormalizing sense scores to make \sum_{num_senses} p(sense | word) = 1
        sense_probabilities = sense_scores / K.expand_dims(K.sum(sense_scores, axis=1) + K.epsilon())  # (samples, num_senses)
        
        if self.use_attention:
             
            # Generalization attention
            input_hyp_projection = K.dot(x_synset_embeddings, self.input_hyp_projector) # (samples, senses, hyps, output_dim)
            context_hyp_projection = K.dot(h_tm1, self.context_hyp_projector) # (samples, output_dim)
            context_hyp_projection_expanded = K.expand_dims(K.expand_dims(context_hyp_projection,
                                                                          dim=1),
                                                            dim=1)  #(samples, 1, 1, output_dim)
            hyp_projection1 = K.tanh(input_hyp_projection + context_hyp_projection_expanded) # (samples, senses, hyps, output_dim)
            hyp_projection2 = K.tanh(K.dot(hyp_projection1, self.hyp_projector2)) # (samples, senses, hyps, output_dim)
            # K.dot doesn't work with tensorflow when one of the arguments is a vector. So expanding and squeezing.
            # (samples, senses, hyps)
            hyp_scores = K.squeeze(K.dot(hyp_projection2, K.expand_dims(self.hyp_scorer)), axis=-1)
            if mask_i is not None:
                hyp_scores = switch(K.squeeze(mask_i, axis=-1), hyp_scores, K.zeros_like(hyp_scores))
            scores_shape = K.shape(hyp_scores)
            # We need to flatten this because we cannot perform softmax on higher-order tensors.
            flattened_scores = K.batch_flatten(hyp_scores)  # (samples, senses*hyps)
            hyp_attention = K.reshape(K.softmax(flattened_scores), scores_shape)  # (samples, senses, hyps)
        else:
            # Without attention, use uniform scores (a matrix of ones) so the shapes stay consistent: (samples, senses, hyps)
            hyp_attention = K.ones_like(x_synset_embeddings)[:, :, :, 0]
            if mask_i is not None:
                hyp_attention = switch(K.squeeze(mask_i, axis=-1), hyp_attention, K.zeros_like(hyp_attention))

        # Renormalizing hyp attention to get p(hyp | sense, word). Summing over hyps.
        hyp_given_sense_attention = hyp_attention / K.expand_dims(K.sum(hyp_attention, axis=2) + K.epsilon())
        # Multiply p(hyp | sense, word) and p(sense | word). The attention values now sum to 1 over senses and hyps.
        sense_hyp_attention = hyp_given_sense_attention * K.expand_dims(sense_probabilities)

        if mask_i is not None:
            # Applying the mask on input
            zeros_like_input = K.zeros_like(x_synset_embeddings)  # (samples, senses, hyps, dim)
            x_synset_embeddings = switch(mask_i, x_synset_embeddings, zeros_like_input) 
            
        weighted_product = x_synset_embeddings * K.expand_dims(sense_hyp_attention)  # (samples, senses, hyps, input_dim)
        # Weighted average, summing over senses and hyps
        lstm_input_t = K.sum(weighted_product, axis=(1, 2))  # (samples, input_dim)
        # Now pass the computed lstm_input to LSTM's step function to get current h and c.
        h, [_, c] = super(OntoAttentionLSTM, self).step(lstm_input_t, lstm_states)
        
        return h, c, sense_hyp_attention
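The sense-probability step in Example #7 draws scores from an exponential distribution, p(sense = k | word) proportional to \lambda * e^(-\lambda * k), falls back to uniform scores when the lambda parameters were never set, and renormalizes over senses. A hedged NumPy sketch of just that step (function name and toy inputs are illustrative):

import numpy as np

def sense_probabilities(sense_parameters, num_senses):
    # sense_parameters: (samples, 1) lambda values; returns (samples, num_senses) probabilities.
    sense_indices = np.arange(num_senses, dtype=float)[None, :]        # (1, num_senses)
    scores = sense_parameters * np.exp(-sense_parameters * sense_indices)
    # Fall back to uniform scores where lambda was never set (all-zero scores).
    uniform = np.full_like(scores, 1.0 / num_senses)
    scores = np.where(scores == 0.0, uniform, scores)
    return scores / (scores.sum(axis=1, keepdims=True) + 1e-8)

lambdas = np.array([[1.5], [0.0]])          # second sample has no sense prior set
print(sense_probabilities(lambdas, 3))      # first row decays geometrically; second is uniform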