Example #1
def normalize_mask(x, mask):
    '''Keep the mask aligned with the tensor x

    Arguments: x is a data tensor; mask is a binary tensor
    Rationale: keep the mask at the same dimensionality as x, but with only a
               length-1 trailing dimension. This ensures broadcastability, which
               matters because inferring shapes is hard and shapes are easy to get wrong.
    '''
    mask = K.cast(mask, K.floatx())
    while K.ndim(mask) != K.ndim(x):
        if K.ndim(mask) > K.ndim(x):
            mask = K.any(mask, axis=-1)
        elif K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
    return K.any(mask, axis=-1, keepdims=True)
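
A quick sanity check of the broadcastability rationale in the docstring (a minimal sketch, assuming the usual "from keras import backend as K" import and the normalize_mask definition above): a rank-2 mask is expanded to the rank of a rank-3 tensor and then reduced back to a length-1 trailing dimension.

import numpy as np
from keras import backend as K

x = K.constant(np.random.rand(2, 5, 4))        # (batch, timesteps, features)
mask = K.constant([[1., 1., 0., 0., 0.],
                   [1., 1., 1., 1., 0.]])      # (batch, timesteps)
norm = normalize_mask(x, mask)                 # expand_dims, then K.any with keepdims
print(K.int_shape(norm))                       # (2, 5, 1) -- broadcastable against x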
Example #2
 def compute_mask(self, x, mask=None):
     if mask is None or mask.ndim==2:
         return None
     elif mask.ndim==3:
         return K.any(mask, axis=(1,2))
     else:
         raise Exception("Unexpected situation")
 def _gen_local_drops(self, count, p):
     # Create a local droppath with at least one path
     arr = self._random_arr(count, p)
     drops = K.switch(
         K.any(arr),
         arr,
         self._arr_with_one(count)
     )
     return drops
Example #4
    def compute_mask(self, x, mask=None):
        if mask is None:
            return None
        #import pdb
        #pdb.set_trace()
        target_dim = K.ndim(x) - 2
        num_reducing = K.ndim(mask) - target_dim
        if num_reducing:
            axes = tuple([-i for i in range(1,num_reducing+1)])
            mask = K.any(mask, axes)

        return mask
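
A small illustration of the axis arithmetic above (the shapes are assumptions for the sketch): with a 4-D input, target_dim is 2, so a 3-D mask gives num_reducing == 1 and axes == (-1,).

from keras import backend as K

x_demo = K.zeros((2, 5, 7, 3))                  # K.ndim(x_demo) == 4, so target_dim == 2
mask_demo = K.zeros((2, 5, 7))                  # K.ndim(mask_demo) == 3, so num_reducing == 1
axes = tuple([-i for i in range(1, 2)])         # (-1,)
print(K.int_shape(K.any(mask_demo, axes)))      # (2, 5), i.e. one axis reduced away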
Example #5
 def call(self, x, mask=None):
     # x: (batch_size, input_length, input_dim)
     if mask is None:
         return K.mean(x, axis=1)  # (batch_size, input_dim)
     else:
         # This is to remove padding from the computational graph.
         if K.ndim(mask) > K.ndim(x):
             # This is due to the bug in Bidirectional that is passing the input mask
             # instead of computing output mask.
             # TODO: Fix the implementation of Bidirectional.
             mask = K.any(mask, axis=(-2, -1))
         if K.ndim(mask) < K.ndim(x):
             mask = K.expand_dims(mask)
         masked_input = switch(mask, x, K.zeros_like(x))
         weights = K.cast(mask / (K.sum(mask) + K.epsilon()), 'float32')
         return K.sum(masked_input * weights, axis=1)  # (batch_size, input_dim)
 def call(self, x, mask=None):
     # x: (batch_size, input_length, input_dim) where input_length = head_size + 2
     head_encoding = x[:, :-2, :]  # (batch_size, head_size, input_dim)
     prep_encoding = x[:, -2, :]  # (batch_size, input_dim)
     child_encoding = x[:, -1, :]  # (batch_size, input_dim)
     if self.composition_type == 'HPCD':
         # TODO: The following line may not work with TF.
         # (batch_size, head_size, input_dim, 1) * (1, head_size, input_dim, proj_dim)
         head_proj_prod = K.expand_dims(head_encoding) * K.expand_dims(self.dist_proj_head, dim=0)
         head_projection = K.sum(head_proj_prod, axis=2)  # (batch_size, head_size, proj_dim)
     else:
         head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, head_size, proj_dim)
     prep_projection = K.expand_dims(K.dot(prep_encoding, self.proj_prep), dim=1)  # (batch_size, 1, proj_dim)
     child_projection = K.expand_dims(K.dot(child_encoding, self.proj_child), dim=1)  # (batch_size, 1, proj_dim)
     #(batch_size, head_size, proj_dim)
     if self.composition_type == 'HPCT':
         composed_projection = K.tanh(head_projection + prep_projection + child_projection)
     elif self.composition_type == 'HPC' or self.composition_type == "HPCD":
         prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, 1, proj_dim)
         composed_projection = K.tanh(head_projection + prep_child_projection)
     else:
          # Composition type is HC
         composed_projection = K.tanh(head_projection + child_projection)
     for hidden_layer in self.hidden_layers:
         composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, head_size, proj_dim)
     # (batch_size, head_size)
     head_word_scores = K.squeeze(K.dot(composed_projection, self.scorer), axis=-1)
     if mask is None:
         attachment_probabilities = K.softmax(head_word_scores)  # (batch_size, head_size)
     else:
         if K.ndim(mask) > 2:
             # This means this layer came after a Bidirectional layer. Keras has this bug which
             # concatenates input masks instead of output masks.
             # TODO: Fix Bidirectional instead.
             mask = K.any(mask, axis=(-2, -1))
         # We need to do a masked softmax.
         exp_scores = K.exp(head_word_scores)  # (batch_size, head_size)
         head_mask = mask[:, :-2]  # (batch_size, head_size)
         # (batch_size, head_size)
         masked_exp_scores = switch(head_mask, exp_scores, K.zeros_like(head_encoding[:, :, 0]))
          # (batch_size, 1). Adding epsilon to avoid division by 0; the cast is needed because epsilon is float64.
         exp_sum = K.cast(K.expand_dims(K.sum(masked_exp_scores, axis=1) + K.epsilon()), 'float32')
         attachment_probabilities = masked_exp_scores / exp_sum  # (batch_size, head_size)
     return attachment_probabilities
Example #7
 def call(self, x, mask=None):
     mean = super(IntraAttention, self).call(x, mask)
     # x: (batch_size, input_length, input_dim)
     # mean: (batch_size, input_dim)
     ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)), dim=0)  # (1, input_length)
     # (batch_size, input_length, input_dim)
     tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones), (0, 2, 1))
     if mask is not None:
         if K.ndim(mask) > K.ndim(x):
             # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
             # TODO: Fix Bidirectional.
             mask = K.any(mask, axis=(-2, -1))
         if K.ndim(mask) < K.ndim(x):
             mask = K.expand_dims(mask)
         x = switch(mask, x, K.zeros_like(x))
     # (batch_size, input_length, proj_dim)
     projected_combination = K.tanh(K.dot(x, self.vector_projector) + K.dot(tiled_mean, self.mean_projector))
     scores = K.dot(projected_combination, self.scorer)  # (batch_size, input_length)
     weights = K.softmax(scores)  # (batch_size, input_length)
     attended_x = K.sum(K.expand_dims(weights) * x, axis=1)  # (batch_size, input_dim)
     return attended_x
Example #8
 def _gen_local_drops(self, count, p):
     # Create a local droppath with at least one path
     arr = self._random_arr(count, p)
     drops = K.switch(K.any(arr), arr, self._arr_with_one(count))
     return drops
def sparse_masked_mlm_loss(y_true, y_pred):
    mask = K.cast(K.any(y_true, axis=-1), "float32")
    cce = K.sparse_categorical_crossentropy(y_true, y_pred)
    masked_cce = mask * cce
    return K.sum(masked_cce) / (K.sum(mask) + K.epsilon())
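
A small numeric check of the masking behaviour (illustrative values only; the convention assumed here is that y_true carries integer class ids with a trailing length-1 axis and that padded positions are all zeros):

from keras import backend as K

y_true = K.constant([[[2.], [0.]]])                        # second position is padding
y_pred = K.constant([[[0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]])  # per-token probabilities
# Only the first position contributes, so the result is about -log(0.8) ~= 0.22.
print(K.eval(sparse_masked_mlm_loss(y_true, y_pred)))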
Example #10
  def compute_mask(self, inputs, mask=None):
    """Computes an output mask tensor for Embedding layer.

    This is based on the inputs, mask, and the inner layer.
    If batch size is specified:
    Simply return the input `mask`. (An RNN-based implementation with
    more than one RNN input is required but not supported in tf.keras yet.)
    Otherwise we call `compute_mask` of the inner layer at each time step.
    If the output mask at each time step is not `None`:
    (E.g., inner layer is Masking or RNN)
    Concatenate all of them and return the concatenation.
    If the output mask at each time step is `None` and the input mask is not
    `None`:(E.g., inner layer is Dense)
    Reduce the input_mask to 2 dimensions and return it.
    Otherwise (both the output mask and the input mask are `None`):
    (E.g., `mask` is not used at all)
    Return `None`.

    Arguments:
      inputs: Tensor with shape [batch size, timesteps, ...] indicating the
        input to TimeDistributed. If static shape information is available for
        "batch size", `mask` is returned unmodified.
      mask: Either None (indicating no masking) or a Tensor indicating the
        input mask for TimeDistributed. The shape can be static or dynamic.

    Returns:
      Either None (no masking), or a [batch size, timesteps, ...] Tensor with
      an output mask for the TimeDistributed layer with the shape beyond the
      second dimension being the value of the input mask shape (if the computed
      output mask is None), an output mask with the shape beyond the first
      dimension being the value of the mask shape (if mask is not None), or an
      output mask with the shape beyond the first dimension being the
      value of the computed output shape.

    """
    # Cases that need to call the inner layer's compute_mask when input_mask is None:
    # Masking layer and Embedding layer with mask_zero
    input_shape = tf.nest.map_structure(
        lambda x: tf.TensorShape(K.int_shape(x)), inputs)
    input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
    batch_size = tf_utils.convert_shapes(input_shape)
    batch_size = tf.nest.flatten(batch_size)[0]
    is_ragged_input = tf.nest.map_structure(
        lambda x: isinstance(x, tf.RaggedTensor), inputs)
    is_ragged_input = generic_utils.to_list(tf.nest.flatten(is_ragged_input))
    if batch_size and not self._always_use_reshape or any(is_ragged_input):
      # batch size matters, we currently do not handle mask explicitly, or if
      # the layer always uses reshape approach, or the input is a ragged tensor.
      return mask
    inner_mask = mask
    if inner_mask is not None:
      inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
      inner_mask = K.reshape(inner_mask, inner_mask_shape)
    inner_input_shape = tf.nest.map_structure(
        lambda tensor: self._get_shape_tuple((-1,), tensor, 2), inputs)
    inner_inputs = tf.__internal__.nest.map_structure_up_to(inputs, tf.reshape, inputs,
                                            inner_input_shape)
    output_mask = self.layer.compute_mask(inner_inputs, inner_mask)
    if output_mask is None:
      if mask is None:
        return None
      # input_mask is not None, and output_mask is None:
      # we should return a not-None mask
      output_mask = mask
      for _ in range(2, len(K.int_shape(mask))):
        output_mask = K.any(output_mask, axis=-1)
    else:
      # output_mask is not None. We need to reshape it
      input_length = tf_utils.convert_shapes(input_shape)
      input_length = tf.nest.flatten(input_length)[1]
      if not input_length:
        input_length = tf.nest.map_structure(lambda x: K.shape(x)[1], inputs)
        input_length = tf.nest.flatten(input_length)[0]
      output_mask_int_shape = K.int_shape(output_mask)
      if output_mask_int_shape is None:
        # if the output_mask does not have a static shape,
        # its shape must be the same as mask's
        if mask is not None:
          output_mask_int_shape = K.int_shape(mask)
        else:
          input_shape = generic_utils.to_list(tf.nest.flatten(input_shape))[0]
          output_mask_int_shape = K.compute_output_shape(input_shape)[:-1]
      output_mask_shape = self._get_shape_tuple(
          (-1, input_length), output_mask, 1, output_mask_int_shape[1:])
      output_mask = K.reshape(output_mask, output_mask_shape)
    return output_mask
Example #11
    def compute_mask(self, inputs, mask=None):
        if mask is not None:
            mask = K.any(mask, axis=-1, keepdims=True)

        return mask
Example #12
    def get_updates(self, loss, params):

        self.updates = []
        self.updates.append(K.update_add(self.state_counter, 1))
        self.updates.append(K.update_add(self.iterator, 1))
        self.updates.append(K.update_add(self.iterations, 1))

        lr = self.lr
        ## lr exponential decay
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        shapes = [K.int_shape(p) for p in params]
        x = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]
        mu = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]

        grads = self.get_gradients(loss, params)
        moments = [
            K.zeros(shape, name='moment_' + str(i))
            for (i, shape) in enumerate(shapes)
        ]

        for x_i, x_prime_i, mu_i, g, m in zip(x, params, mu, grads, moments):

            ## We update x_prime (if we are in Langevin steps, we update; otherwise we switch to parameters x_i)
            dx_prime_i = g - self.gamma * (x_i - x_prime_i)
            x_prime_update_i = K.switch(
                K.any(K.stack([
                    K.equal(self.state_counter, 0),
                    K.equal(self.num_steps, self.iterator)
                ],
                              axis=0),
                      axis=0), x_i, x_prime_i - self.sgld_step * dx_prime_i +
                K.sqrt(self.sgld_step) * self.sgld_noise *
                K.random_normal(K.int_shape(x_prime_i)))
            # Apply constraints.
            if getattr(x_prime_i, 'constraint', None) is not None:
                x_prime_update_i = x_prime_i.constraint(x_prime_update_i)
            self.updates.append(K.update(x_prime_i, x_prime_update_i))

            ## We update mu (if we are in Langevin steps, we update; otherwise we switch to parameters x_i)
            mu_update_i = K.switch(K.equal(self.state_counter,
                                           0), x_i, (1 - self.alpha) * mu_i +
                                   self.alpha * x_prime_i)
            self.updates.append(K.update(mu_i, mu_update_i))

            ## As they described in the paper, we remove the gamma from the update because it interferes with the learning annealing
            ## After each outer loop update we apply an exponential decay on gamma
            ## The following lines concern the outer loop updates

            ## Nesterov's momentum
            gradient = (x_i - mu_i)
            v = self.momentum * m - lr * gradient  # velocity
            self.updates.append(
                K.update(
                    m, K.switch(K.equal(self.state_counter, self.L + 1), v,
                                m)))
            if self.nesterov:
                new_x_i = x_i + self.momentum * v - lr * gradient
            else:
                new_x_i = x_i + v

            x_i_update = K.switch(K.equal(self.state_counter, self.L + 1),
                                  new_x_i, x_i)
            self.updates.append(K.update(x_i, x_i_update))

            ## Gamma scoping
            gamma_update = K.switch(K.equal(self.state_counter,
                                            self.L + 1), self.gamma,
                                    self.gamma * (1. + self.scoping))
            self.updates.append(K.update(self.gamma, gamma_update))

        counter = K.switch(K.equal(self.state_counter, self.L + 2),
                           K.constant(0, dtype='int64'), self.state_counter)
        self.updates.append(K.update(self.state_counter, counter))
        return self.updates
Example #13
 def compute_mask(self, inputs, mask=None):
     # pylint: disable=unused-argument
     if mask is None:
         return None
     return K.any(mask, axis=self.axis)
Example #14
 def compute_mask(self, input, mask=None):
     if mask is not None and self.learn_mode == 'join':
         return K.any(mask, axis=1)
     return mask
Example #15
 def call(self, inputs, **kwargs):
     mask = K.any(K.not_equal(inputs, self.mask_value), axis=-1)
     return K.cast(mask, K.floatx())
Example #16
def bool_match(y_true, y_pred):
    return K.switch(K.any(y_true - K.round(y_pred)), K.variable(0),
                    K.variable(1))
Example #17
def compute_mask(x, mask_value=0):
    boolean_mask = K.any(K.not_equal(x, mask_value), axis=-1, keepdims=False)
    return K.cast(boolean_mask, K.floatx())
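
An illustrative check (assumed convention, matching the code: a timestep counts as padding only when its entire feature vector equals mask_value, 0 by default):

from keras import backend as K

x = K.constant([[[1., 2.], [0., 0.], [3., 0.]]])
print(K.eval(compute_mask(x)))   # [[1., 0., 1.]] -- only the all-zero timestep is masked out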
Example #18
 def compute_mask(self, inputs, mask=None):
     masked_cols = K.cast(K.any(K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True), K.floatx())
     masked_rows = K.cast(K.any(K.not_equal(inputs, self.mask_value), axis=-2, keepdims=True), K.floatx())
     return K.batch_dot(masked_cols, masked_rows)
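
A sketch of what the batch_dot produces (illustrative values, mask_value assumed to be 0): the returned 2-D mask is the outer product of the "row is not all padding" and "column is not all padding" indicators.

from keras import backend as K

inputs = K.constant([[[1., 0.],
                      [0., 0.]]])   # the second row and second column are entirely padding
cols = K.cast(K.any(K.not_equal(inputs, 0.), axis=-1, keepdims=True), K.floatx())  # (1, 2, 1)
rows = K.cast(K.any(K.not_equal(inputs, 0.), axis=-2, keepdims=True), K.floatx())  # (1, 1, 2)
print(K.eval(K.batch_dot(cols, rows)))   # [[[1., 0.], [0., 0.]]]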
Example #19
 def call(self, x, mask=None):
   assert mask is None
   return K.cast(K.any(x, axis=-1), K.floatx())
Example #20
 def call(self, inputs, mask=None):
     boolean_mask = K.any(K.not_equal(inputs, self.mask_value),
                          axis=-1, keepdims=self.keepdims)
     if mask is not None:
         boolean_mask *= mask
     return K.cast(boolean_mask, K.floatx())
Example #21
def true_label_exists(y_true):
    return K.any(y_true, axis=0)
 def compute_mask(self, x, input_mask=None):
     # pylint: disable=unused-argument
     if input_mask is None:
         return None
     else:
         return K.any(input_mask, axis=-1)
Example #23
 def compute_mask(self, inputs, mask=None):
     if len(inputs.shape) > 2 and mask is not None:
         return K.any(mask, axis=-1, keepdims=False)
     else:  # don't return a mask
         return None
Example #24
 def compute_mask(self, inputs, mask=None):
     return K.any(K.not_equal(inputs, 0.), axis=[2, 3, 4])
Example #25
 def squash_mask(self, mask):
     if K.ndim(mask) == 2:
         return mask
     elif K.ndim(mask) == 3:
         return K.any(mask, axis=-1)
Example #26
    def pfn_mask_func(X, mask_val=mask_val):

        # map mask_val to zero and return 1 elsewhere
        return K.cast(K.any(K.not_equal(X, mask_val), axis=-1), K.dtype(X))
Example #27
 def compute_mask(self, input, mask=None):
     if mask is not None:
         return K.any(mask, axis=1)
     return mask
Example #28
 def call(self, inputs):
     boolean_mask = K.any(K.not_equal(inputs[1], self.mask_value),
                          axis=-1, keepdims=True)
     return inputs[0] * K.cast(boolean_mask, K.dtype(inputs[0]))
Example #29
 def compute_mask(self, x, input_mask=None):
     return K.any(K.greater(x, self.mask_value), axis=-1)
Example #30
def MaskingHack(x):
    #mask = K.repeat_elements( K.any(x[:,:,0:-2], axis=-1), rep=x.shape[-1], axis=-1 )
    mask = K.any(x[:,:,0:-2], axis=-1, keepdims=True)
    return x * K.cast(mask, K.dtype(x))  # cast: K.any returns a boolean tensor
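
A hypothetical way to drop the hack into a functional model (Input and Lambda are the standard Keras layers; the feature layout, with two trailing auxiliary channels excluded from the padding test, is an assumption inferred from the slicing above):

from keras.layers import Input, Lambda

inp = Input(shape=(None, 10))     # the last two channels are ignored by the padding test
out = Lambda(MaskingHack)(inp)    # timesteps whose first 8 channels are all zero become zero
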
def build_model(char_size=27,
                dim=64,
                iterations=4,
                training=True,
                ilp=False,
                pca=False):
    """Build the model."""
    # Inputs
    # Context: (rules, preds, chars,)
    # context = L.Input(shape=(None, None, None,), name='context', dtype='int32')
    # query = L.Input(shape=(None,), name='query', dtype='int32')

    if ilp:
        context, query, templates = ilp

    # Contextual embedding of symbols
    # texts = []  # list of text samples
    # id_list = []
    # question_list = []
    # label_list = []
    # labels_index = {}  # dictionary mapping label name to numeric id
    # labels = []  # list of label ids
    # TEXT_DATA_DIR = os.path.abspath('.') + "/data/pararule"
    # # TEXT_DATA_DIR = "D:\\AllenAI\\20_newsgroup"
    # Str = '.jsonl'
    # CONTEXT_TEXTS = []
    # test_str = 'test'
    # meta_str = 'meta'

    # for name in sorted(os.listdir(TEXT_DATA_DIR)):
    #   path = os.path.join(TEXT_DATA_DIR, name)
    #   if os.path.isdir(path):
    #     label_id = len(labels_index)
    #     labels_index[name] = label_id
    #     for fname in sorted(os.listdir(path)):
    #       fpath = os.path.join(path, fname)
    #       if Str in fpath:
    #         if test_str not in fpath:
    #           if meta_str not in fpath:
    #             with open(fpath) as f:
    #               for l in json_lines.reader(f):
    #                 if l["id"] not in id_list:
    #                   id_list.append(l["id"])
    #                   questions = l["questions"]
    #                   context = l["context"].replace("\n", " ")
    #                   context = re.sub(r'\s+', ' ', context)
    #                   CONTEXT_TEXTS.append(context)
    #                   for i in range(len(questions)):
    #                     text = questions[i]["text"]
    #                     label = questions[i]["label"]
    #                     if label == True:
    #                       t = 1
    #                     else:
    #                       t = 0
    #                     q = re.sub(r'\s+', ' ', text)
    #                     texts.append(context)
    #                     question_list.append(q)
    #                     label_list.append(int(t))
    #             f.close()
    #       # labels.append(label_id)

    print('Found %s texts.' % len(CONTEXT_TEXTS))

    # MAX_NB_WORDS = 20000
    # MAX_SEQUENCE_LENGTH = 1000
    # tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    # tokenizer.fit_on_texts(texts)
    # #sequences = tokenizer.texts_to_sequences(texts)

    word_index = WORD_INDEX
    print('Found %s unique tokens.' % len(word_index))

    #data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # labels = to_categorical(np.asarray(labels))
    #print('Shape of data tensor:', data.shape)
    # print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    # indices = np.arange(data.shape[0])
    # np.random.shuffle(indices)
    # data = data[indices]
    # labels = labels[indices]

    embeddings_index = {}
    GLOVE_DIR = os.path.abspath('.') + "/data/glove"
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),
             'r',
             encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    EMBEDDING_DIM = 100

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = L.Embedding(len(word_index) + 1,
                                  EMBEDDING_DIM,
                                  weights=[embedding_matrix],
                                  trainable=False)

    context = L.Input(shape=(
        None,
        None,
        None,
    ),
                      name='context',
                      dtype='int32')
    query = L.Input(shape=(None, ), name='query', dtype='int32')

    embedded_ctx = embedding_layer(
        context)  # (?, rules, preds, chars, char_size)
    embedded_q = embedding_layer(query)  # (?, chars, char_size)
    #onehot_weights = np.eye(char_size)
    #onehot_weights[0, 0] = 0 # Clear zero index
    # onehot = L.Embedding(char_size, char_size,
    #                      trainable=False,
    #                      weights=[onehot_weights],
    #                      name='onehot')
    # embedded_ctx = onehot(context) # (?, rules, preds, chars, char_size)
    # embedded_q = onehot(query) # (?, chars, char_size)

    if ilp:
        # Combine the templates with the context, (?, rules+temps, preds, chars, char_size)
        embedded_ctx = L.Lambda(lambda xs: K.concatenate(xs, axis=1),
                                name='template_concat')(
                                    [templates, embedded_ctx])
        # embedded_ctx = L.concatenate([templates, embedded_ctx], axis=1)

    embed_pred = ZeroGRU(dim, go_backwards=True, name='embed_pred')
    embedded_predq = embed_pred(embedded_q)  # (?, dim)
    # For every rule, for every predicate, embed the predicate
    embedded_ctx_preds = L.TimeDistributed(L.TimeDistributed(embed_pred,
                                                             name='nest1'),
                                           name='nest2')(embedded_ctx)
    # (?, rules, preds, dim)

    # embed_rule = ZeroGRU(dim, go_backwards=True, name='embed_rule')
    # embedded_rules = NestedTimeDist(embed_rule, name='d_embed_rule')(embedded_ctx_preds)
    get_heads = L.Lambda(lambda x: x[:, :, 0, :], name='rule_heads')
    embedded_rules = get_heads(embedded_ctx_preds)
    # (?, rules, dim)

    # Reused layers over iterations
    repeat_toctx = L.RepeatVector(K.shape(embedded_ctx)[1],
                                  name='repeat_to_ctx')
    diff_sq = L.Lambda(lambda xy: K.square(xy[0] - xy[1]),
                       output_shape=(None, dim),
                       name='diff_sq')
    mult = L.Multiply()
    concat = L.Lambda(lambda xs: K.concatenate(xs, axis=2),
                      output_shape=(None, dim * 5),
                      name='concat')
    att_densel = L.Dense(dim // 2, activation='tanh', name='att_densel')
    att_dense = L.Dense(1, activation='sigmoid', name='att_dense')
    squeeze2 = L.Lambda(lambda x: K.squeeze(x, 2), name='sequeeze2')
    rule_mask = L.Lambda(lambda x: K.cast(
        K.any(K.not_equal(x, 0), axis=-1, keepdims=True), 'float32'),
                         name='rule_mask')(embedded_rules)

    unifier = NestedTimeDist(ZeroGRU(dim, name='unifier'), name='dist_unifier')
    dot11 = L.Dot((1, 1))
    # gating = L.Dense(1, activation='sigmoid', name='gating')
    # gate2 = L.Lambda(lambda xyg: xyg[2]*xyg[0] + (1-xyg[2])*xyg[1], name='gate')

    # Reasoning iterations
    state = embedded_predq
    repeated_q = repeat_toctx(embedded_predq)
    outs = list()
    for _ in range(iterations):
        # Compute attention between rule and query state
        ctx_state = repeat_toctx(state)  # (?, rules, dim)
        s_s_c = diff_sq([ctx_state, embedded_rules])
        s_m_c = mult([embedded_rules, state])  # (?, rules, dim)
        sim_vec = concat([s_s_c, s_m_c, ctx_state, embedded_rules, repeated_q])
        sim_vec = att_densel(sim_vec)  # (?, rules, dim//2)
        sim_vec = att_dense(sim_vec)  # (?, rules, 1)
        sim_vec = mult([sim_vec, rule_mask])
        sim_vec = squeeze2(sim_vec)  # (?, rules)
        # sim_vec = L.Softmax(axis=1)(sim_vec)
        outs.append(sim_vec)

        # Unify every rule and weighted sum based on attention
        new_states = unifier(embedded_ctx_preds, initial_state=[state])
        # (?, rules, dim)
        state = dot11([sim_vec, new_states])

        # Apply gating
        # gate = gating(state)
        # outs.append(gate)
        # state = gate2([state, new_state, gate])

    # Prediction
    out = L.Dense(1, activation='sigmoid', name='out')(state)
    if ilp:
        return outs, out
    elif pca:
        model = Model([context, query], [embedded_rules])
    elif training:
        model = Model([context, query], [out])
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['acc'])
    else:
        model = Model([context, query], outs + [out])
    return model
Example #32

# Building the Keras MTLSTM model with a short-cut fix for the Keras masking + Bidirectional issue
x = Input(shape=(None, 300))
y = Masking(mask_value=0., input_shape=(None, 300))(x)
y = Bidirectional(LSTM(300,
                       return_sequences=True,
                       recurrent_activation='sigmoid',
                       name='lstm1'),
                  name='bidir_1')(y)
y = Bidirectional(LSTM(300,
                       return_sequences=True,
                       recurrent_activation='sigmoid',
                       name='lstm2'),
                  name='bidir_2')(y)

# These 2 layers are a short-cut fix for the issue -
y_rev_mask_fix = Lambda(lambda x: K.cast(
    K.any(K.not_equal(x, 0.), axis=-1, keepdims=True), K.floatx()))(x)
y = Multiply()([y, y_rev_mask_fix])

keras_model = Model(inputs=x, outputs=y)

# Load the Python3 port of the model - MAKE SURE THIS FILE EXISTS BEFORE RUNNING THE SCRIPT - GET IT FROM https://github.com/rgsachin/CoVe
keras_model.load_weights('Keras_CoVe.h5')

# Save a new Python2 port of the model
keras_model.save('Keras_CoVe_Python2.h5')

print("Done")
def masked_loss(y_true, y_pred):
    y_mask = K.cast(K.any(y_true, axis=-1), "float32")
    loss = K.switch(y_mask, K.sparse_categorical_crossentropy(y_true, y_pred),
                    K.zeros_like(y_mask, dtype=K.floatx()))
    return K.sum(loss) / (K.cast(K.sum(y_mask), dtype='float32') + K.epsilon())
Example #34
def get_seq_length(x):
    return K.sum(K.cast(K.any(K.not_equal(x, 0), axis=-1), K.floatx()),
                 axis=-1)
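
An illustrative check (assumed convention, matching the code: an all-zero feature vector marks a padded timestep):

from keras import backend as K

x = K.constant([[[3., 1.], [0., 0.], [2., 2.]],
                [[0., 0.], [0., 0.], [0., 0.]]])
print(K.eval(get_seq_length(x)))   # [2., 0.]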
Example #35
 def compute_mask(self, inputs, input_mask=None):  # pylint: disable=unused-argument
     options = inputs[2]
     padding_mask = K.not_equal(options, K.zeros_like(options))
     return K.cast(K.any(padding_mask, axis=2), "float32")
 def _gen_local_drops(self, count, p):
     arr = self._random_arr(count, p)
     drops = K.switch(K.any(arr), arr, self._arr_with_one(count))
     
     return drops
 def compute_mask(self, inputs, mask=None):
     return K.any(K.not_equal(inputs, 0.), axis=[2, 3, 4])
Example #38
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)

        # first update the number of iterations
        self.updates = [K.update_add(self.iterations, 1)]

        if self.decay_epochs:
            ite_casted = K.cast(self.iterations, K.dtype(self.decay_epochs))
            hit_decay_epoch = K.any(K.equal(ite_casted, self.decay_epochs))

            print(hit_decay_epoch)
            lr = K.switch(hit_decay_epoch, self.lr['all'] * self.decay['all'],
                          self.lr['all'])

            K.print_tensor(self.lr['all'])
            #a = K.switch(hit_decay_epoch,
            #             K.print_tensor(self.lr['all'],message='Decays:'),
            #             K.print_tensor(self.lr['all'],message=' '))

            self.updates.append(K.update(self.lr['all'], lr))

        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(s) for s in shapes]
        self.weights = [self.iterations] + moments
        print(self.weights)

        for p, g, m in zip(params, grads, moments):
            #print("HEREEEE:", p.name, g, m)
            if p.name in self.lr.keys():
                if self.verbose > 0:
                    print("Setting different learning rate for ", p.name,
                          " : ", K.eval(self.lr[p.name]))
                lr = self.lr[p.name]
                if self.decay_epochs and p.name in self.decay.keys():
                    lr = K.switch(hit_decay_epoch,
                                  self.lr[p.name] * self.decay[p.name],
                                  self.lr[p.name])
                    self.updates.append(K.update(self.lr[p.name], lr))
                    if self.verbose > 0:
                        print("Added decay to ", p.name, ": ", K.eval(lr), ",",
                              self.decay[p.name])
                elif self.decay_epochs:
                    lr = K.switch(hit_decay_epoch,
                                  self.lr[p.name] * self.decay['all'],
                                  self.lr[p.name])
                    self.updates.append(K.update(self.lr[p.name], lr))
                    if self.verbose > 0:
                        print("Added decay to ", p.name, ": ", K.eval(lr), ",",
                              self.decay['all'])
                else:
                    lr = self.lr[p.name]

            else:
                lr = self.lr['all']

            if p.name in self.momentum.keys():
                if self.verbose > 0:
                    print("Setting different momentum for ", p.name, " , ",
                          K.eval(self.momentum[p.name]))
                momentum = self.momentum[p.name]
            else:
                momentum = self.momentum['all']

            v = momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + momentum * (momentum * m - lr * g) - lr * g
            else:
                new_p = p + momentum * m - lr * g

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            if self.clips_val and (p.name in self.clips.keys()):
                if self.verbose > 0:
                    print("Clipping variable", p.name, " to ",
                          self.clips[p.name])
                c = K.eval(self.clips[p.name])
                new_p = K.clip(new_p, c[0], c[1])
            print("updates for ", p.name, " lr: ", K.eval(lr), " mom:",
                  K.eval(momentum))
            self.updates.append(K.update(p, new_p))
        return self.updates
Example #39
def build_model(char_size=27, dim=64, iterations=4, training=True, pca=False):
    """Build the model."""
    # Inputs
    # Context: (rules, preds, chars,)
    context = L.Input(shape=(
        None,
        None,
        None,
    ),
                      name='context',
                      dtype='int32')
    query = L.Input(shape=(None, ), name='query', dtype='int32')

    # Flatten preds to embed entire rules
    var_flat = L.Lambda(lambda x: K.reshape(
        x, K.stack([K.shape(x)[0], -1,
                    K.prod(K.shape(x)[2:])])),
                        name='var_flat')
    flat_ctx = var_flat(context)  # (?, rules, preds*chars)

    # Onehot embedding
    # Contextual embedding of symbols
    onehot_weights = np.eye(char_size)
    onehot_weights[0, 0] = 0  # Clear zero index
    onehot = L.Embedding(char_size,
                         char_size,
                         trainable=False,
                         weights=[onehot_weights],
                         name='onehot')
    embedded_ctx = onehot(flat_ctx)  # (?, rules, preds*chars*char_size)
    embedded_q = onehot(query)  # (?, chars, char_size)

    embed_pred = ZeroGRU(dim, go_backwards=True, name='embed_pred')
    embedded_predq = embed_pred(embedded_q)  # (?, dim)
    # Embed every rule
    embedded_rules = NestedTimeDist(embed_pred,
                                    name='rule_embed')(embedded_ctx)
    # (?, rules, dim)

    # Reused layers over iterations
    repeat_toctx = L.RepeatVector(K.shape(embedded_ctx)[1],
                                  name='repeat_to_ctx')
    diff_sq = L.Lambda(lambda xy: K.square(xy[0] - xy[1]),
                       output_shape=(None, dim),
                       name='diff_sq')
    concat = L.Lambda(lambda xs: K.concatenate(xs, axis=2),
                      output_shape=(None, dim * 5),
                      name='concat')
    att_dense1 = L.TimeDistributed(L.Dense(dim,
                                           activation='tanh',
                                           name='att_dense1'),
                                   name='d_att_dense1')
    att_dense2 = L.TimeDistributed(L.Dense(1,
                                           activation='sigmoid',
                                           name='att_dense2'),
                                   name='d_att_dense2')
    squeeze2 = L.Lambda(lambda x: K.squeeze(x, 2), name='sequeeze2')
    # expand = L.Lambda(lambda x: K.expand_dims(x, axis=2), name='expand')
    rule_mask = L.Lambda(lambda x: K.cast(
        K.any(K.not_equal(x, 0), axis=-1, keepdims=True), 'float32'),
                         name='rule_mask')(embedded_rules)
    episodic_mem = EpisodicMemory(dim, name='episodic_mem')

    # Reasoning iterations
    state = embedded_predq
    repeated_q = repeat_toctx(embedded_predq)
    outs = list()
    for _ in range(iterations):
        # Compute attention between rule and query state
        ctx_state = repeat_toctx(state)  # (?, rules, dim)
        s_s_c = diff_sq([ctx_state, embedded_rules])
        s_m_c = L.multiply([embedded_rules, state])  # (?, rules, dim)
        sim_vec = concat([s_s_c, s_m_c, ctx_state, embedded_rules, repeated_q])
        sim_vec = att_dense1(sim_vec)  # (?, rules, dim)
        sim_vec = att_dense2(sim_vec)  # (?, rules, 1)
        # sim_vec = squeeze2(sim_vec) # (?, rules)
        # sim_vec = L.Softmax(axis=1)(sim_vec)
        # sim_vec = expand(sim_vec) # (?, rules, 1)
        sim_vec = L.multiply([sim_vec, rule_mask])

        state = episodic_mem([state, sim_vec, embedded_rules])
        sim_vec = squeeze2(sim_vec)  # (?, rules)
        outs.append(sim_vec)

    # Prediction
    out = L.Dense(1, activation='sigmoid', name='out')(state)
    if pca:
        model = Model([context, query], [embedded_rules])
    elif training:
        model = Model([context, query], [out])
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['acc'])
    else:
        model = Model([context, query], outs + [out])
    return model
Example #40
 def call(self, x, mask=None):
     boolean_mask = K.any(K.greater(x, self.mask_value),
                          axis=-1, keepdims=True)
     return x * K.cast(boolean_mask, K.floatx())
Example #41
def changing_ndim_rnn_tf(step_function, inputs, initial_states, go_backwards, mask,
                         constants, unroll, input_length, eliminate_mask_dims):
    '''Iterates over the time dimension of a tensor.

    # Arguments
        inputs: tensor of temporal data of shape (samples, time, ...)
            (at least 3D).
        step_function:
            Parameters:
                input: tensor with shape (samples, ...) (no time dimension),
                    representing input for the batch of samples at a certain
                    time step.
                states: list of tensors.
            Returns:
                output: tensor with shape (samples, output_dim) (no time dimension),
                new_states: list of tensors, same length and shapes
                    as 'states'. The first state in the list must be the
                    output tensor at the previous timestep.
        initial_states: tensor with shape (samples, output_dim) (no time dimension),
            containing the initial values for the states used in
            the step function.
        go_backwards: boolean. If True, do the iteration over
            the time dimension in reverse order.
        mask: binary tensor with shape (samples, time, 1),
            with a zero for every element that is masked.
        constants: a list of constant values passed at each step.
        unroll: boolean. If True, unroll the RNN over the time dimension
            (this requires a fixed number of timesteps); otherwise use a
            symbolic loop.
        input_length: not used in this TensorFlow implementation.
        eliminate_mask_dims: axes to reduce out of the per-timestep mask
            before it is applied to the output and states, or None to apply
            the mask as-is.

    # Returns
        A tuple (last_output, outputs, new_states).

        last_output: the latest output of the rnn, of shape (samples, ...)
        outputs: tensor with shape (samples, time, ...) where each
            entry outputs[s, t] is the output of the step function
            at time t for sample s.
        new_states: list of tensors, latest states returned by
            the step function, of shape (samples, ...).
    '''
    import tensorflow as tf

    ndim = len(inputs.get_shape())
    assert ndim >= 3, 'Input should be at least 3D.'
    axes = [1, 0] + list(range(2, ndim))
    inputs = tf.transpose(inputs, (axes))

    if constants is None:
        constants = []

    if unroll:
        if not inputs.get_shape()[0]:
            raise Exception('Unrolling requires a fixed number of timesteps.')

        states = initial_states
        successive_states = []
        successive_outputs = []

        input_list = tf.unpack(inputs)
        if go_backwards:
            input_list.reverse()

        if mask is not None:
            # Transpose not supported by bool tensor types, hence round-trip to uint8.
            mask = tf.cast(mask, tf.uint8)
            if len(mask.get_shape()) == ndim - 1:
                mask = K.expand_dims(mask)
            # Reshaping mask to make timesteps the first dimension.
            mask = tf.cast(tf.transpose(mask, axes), tf.bool)
            mask_list = tf.unpack(mask)

            if go_backwards:
                mask_list.reverse()

            # Iterating over timesteps.
            for input, mask_t in zip(input_list, mask_list):
                # Changing ndim modification: Pass the mask to the step function as a constant.
                output, new_states = step_function(input, states + constants + [mask_t])

                # tf.select needs its condition tensor to be the same shape as its two
                # result tensors, but in our case the condition (mask) tensor is
                # (nsamples, 1), and A and B are (nsamples, ndimensions). So we need to
                # broadcast the mask to match the shape of A and B. That's what the
                # tile call does, is just repeat the mask along its second dimension
                # ndimensions times.
                output_mask_t = tf.tile(mask_t, tf.pack(([1] * (ndim-2)) + [tf.shape(output)[1]]))

                if len(successive_outputs) == 0:
                    prev_output = K.zeros_like(output)
                else:
                    prev_output = successive_outputs[-1]

                # Changing ndim modification: Define output mask with appropriate dims eliminated.
                if eliminate_mask_dims is not None:
                    output_mask_t = tf.cast(K.any(output_mask_t, axis=eliminate_mask_dims), tf.bool)
                else:
                    output_mask_t = tf.cast(output_mask_t, tf.bool)

                output = tf.select(output_mask_t, output, prev_output)

                return_states = []
                for state, new_state in zip(states, new_states):
                    # (see earlier comment for tile explanation)
                    state_mask_t = tf.tile(mask_t, tf.pack(([1] * (ndim-2)) + [tf.shape(new_state)[1]]))
                    # Changing ndim modification: Define output mask with appropriate dims eliminated.
                    if eliminate_mask_dims is not None:
                        state_mask_t = tf.cast(K.any(state_mask_t, axis=eliminate_mask_dims), tf.bool)
                    else:
                        state_mask_t = tf.cast(state_mask_t, tf.bool)
                    return_states.append(tf.select(state_mask_t, new_state, state))

                states = return_states
                successive_outputs.append(output)
                successive_states.append(states)
            last_output = successive_outputs[-1]
            new_states = successive_states[-1]
            outputs = tf.pack(successive_outputs)
        else:
            for input in input_list:
                output, states = step_function(input, states + constants + [None])  # None for mask
                successive_outputs.append(output)
                successive_states.append(states)
            last_output = successive_outputs[-1]
            new_states = successive_states[-1]
            outputs = tf.pack(successive_outputs)

    else:
        from tensorflow.python.ops.rnn import _dynamic_rnn_loop

        if go_backwards:
            inputs = tf.reverse(inputs, [True] + [False] * (ndim - 1))

        states = initial_states
        nb_states = len(states)
        if nb_states == 0:
            # use dummy state, otherwise _dynamic_rnn_loop breaks
            state = inputs[:, 0, :]
            state_size = state.get_shape()[-1]
        else:
            state_size = int(states[0].get_shape()[-1])
            if nb_states == 1:
                state = states[0]
            else:
                state = tf.concat(1, states)

        if mask is not None:
            if len(initial_states) == 0:
                raise ValueError('No initial states provided! '
                                 'When using masking in an RNN, you should '
                                 'provide initial states '
                                 '(and your step function should return '
                                 'as its first state at time `t` '
                                 'the output at time `t-1`).')
            if go_backwards:
                mask = tf.reverse(mask, [True] + [False] * (ndim - 2))

            # Transpose not supported by bool tensor types, hence round-trip to uint8.
            mask = tf.cast(mask, tf.uint8)
            if len(mask.get_shape()) == ndim - 1:
                mask = K.expand_dims(mask)
            mask = tf.transpose(mask, axes)
            # Concatenate at the last dim.
            inputs = tf.concat(ndim-1, [tf.cast(mask, inputs.dtype), inputs])

            def _step(input, state):
                if nb_states > 1:
                    states = []
                    for i in range(nb_states):
                        states.append(state[:, i * state_size: (i + 1) * state_size])
                else:
                    states = [state]

                # The time dimension is not present here.
                step_ndim = ndim - 1
                # Permuting only to take out the mask.
                permuted_input = K.permute_dimensions(input, (step_ndim-1,) + tuple(range(step_ndim-1)))
                mask_t = K.expand_dims(permuted_input[0])
                permuted_input = permuted_input[1:]
                input = K.permute_dimensions(permuted_input, tuple(range(1, step_ndim)) + (0,))
                # changing ndim fix: eliminate necessary dims after selecting the mask from the input.
                if eliminate_mask_dims is not None:
                    output_mask_t = K.sum(mask_t, axis=eliminate_mask_dims)
                else:
                    # No dims to eliminate: use the raw per-timestep mask.
                    output_mask_t = mask_t

                mask_t = tf.cast(mask_t, tf.bool)
                output_mask_t = tf.cast(output_mask_t, tf.bool)
                
                output, new_states = step_function(input, states + constants + [mask_t])

                tiled_output_mask_t = tf.tile(output_mask_t, tf.pack([1, tf.shape(output)[1]]))
                output = tf.select(tiled_output_mask_t, output, states[0])

                return_states = []
                for state, new_state in zip(states, new_states):
                    tiled_state_mask_t = tf.tile(output_mask_t, tf.pack([1, tf.shape(state)[1]]))
                    return_states.append(tf.select(tiled_state_mask_t, new_state, state))

                if len(return_states) == 1:
                    new_state = return_states[0]
                else:
                    new_state = tf.concat(1, return_states)

                return output, new_state
        else:
            def _step(input, state):
                if nb_states > 1:
                    states = []
                    for i in range(nb_states):
                        states.append(state[:, i * state_size: (i + 1) * state_size])
                elif nb_states == 1:
                    states = [state]
                else:
                    states = []
                output, new_states = step_function(input, states + constants + [None])  # None for mask

                if len(new_states) > 1:
                    new_state = tf.concat(1, new_states)
                elif len(new_states) == 1:
                    new_state = new_states[0]
                else:
                    # return dummy state, otherwise _dynamic_rnn_loop breaks
                    new_state = output
                return output, new_state

        _step.state_size = state_size * nb_states
        # recover output size by calling _step on the first input
        slice_begin = tf.pack([0] * ndim)
        slice_size = tf.pack([1] + [-1] * (ndim - 1))
        first_input = tf.slice(inputs, slice_begin, slice_size)
        first_input = tf.squeeze(first_input, [0])
        _step.output_size = int(_step(first_input, state)[0].get_shape()[-1])

        (outputs, final_state) = _dynamic_rnn_loop(
            _step,
            inputs,
            state,
            parallel_iterations=32,
            swap_memory=True,
            sequence_length=None)

        if nb_states > 1:
            new_states = []
            for i in range(nb_states):
                new_states.append(final_state[:, i * state_size: (i + 1) * state_size])
        elif nb_states == 1:
            new_states = [final_state]
        else:
            new_states = []

        outputs_ndim = len(outputs.get_shape())
        # all this circus is to recover the last vector in the sequence.
        slice_begin = tf.pack([tf.shape(outputs)[0] - 1] + [0] * (outputs_ndim - 1))
        slice_size = tf.pack([1] + [-1] * (outputs_ndim - 1))
        last_output = tf.slice(outputs, slice_begin, slice_size)
        last_output = tf.squeeze(last_output, [0])

    axes = [1, 0] + list(range(2, len(outputs.get_shape())))
    outputs = tf.transpose(outputs, axes)
    return last_output, outputs, new_states