def normalize_mask(x, mask):
    '''Keep the mask aligned with the tensor x.

    Arguments: x is a data tensor; mask is a binary tensor.

    Rationale: keep the mask at the same dimensionality as x, but with only a
    length-1 trailing dimension. This ensures broadcastability, which is
    important because inferring shapes is hard and shapes are easy to get wrong.
    '''
    mask = K.cast(mask, K.floatx())
    while K.ndim(mask) != K.ndim(x):
        if K.ndim(mask) > K.ndim(x):
            mask = K.any(mask, axis=-1)
        elif K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
    return K.any(mask, axis=-1, keepdims=True)

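# Usage sketch (illustrative, not from the original source): assuming the Keras
# backend is imported as K, normalize_mask should turn a (batch, time) mask into
# a (batch, time, 1) boolean mask that broadcasts against a (batch, time, dim) x.
import numpy as np
from tensorflow.keras import backend as K

x = K.constant(np.random.rand(2, 4, 3))          # (batch, time, features)
mask = K.constant([[1, 1, 0, 0], [1, 0, 0, 0]])  # (batch, time)
m = normalize_mask(x, mask)
print(K.int_shape(m))                            # (2, 4, 1)
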
def compute_mask(self, x, mask=None):
    if mask is None or mask.ndim == 2:
        return None
    elif mask.ndim == 3:
        mask = K.any(mask, axis=(1, 2))
        return mask
    else:
        raise Exception("Unexpected situation")

def _gen_local_drops(self, count, p):
    # Create a local droppath with at least one path
    arr = self._random_arr(count, p)
    drops = K.switch(K.any(arr), arr, self._arr_with_one(count))
    return drops

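# Standalone sketch (assumption, since _random_arr and _arr_with_one are class
# helpers not shown here): the same "keep at least one path" pattern written
# with explicit tensors.
from tensorflow.keras import backend as K

arr = K.cast(K.random_uniform((5,)) < 0.2, K.floatx())   # hypothetical random keep/drop flags
fallback = K.concatenate([K.ones((1,)), K.zeros((4,))])  # guarantees one surviving path
drops = K.switch(K.any(arr), arr, fallback)
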
def compute_mask(self, x, mask=None):
    if mask is None:
        return None
    target_dim = K.ndim(x) - 2
    num_reducing = K.ndim(mask) - target_dim
    if num_reducing:
        axes = tuple([-i for i in range(1, num_reducing + 1)])
        mask = K.any(mask, axes)
    return mask

def call(self, x, mask=None):
    # x: (batch_size, input_length, input_dim)
    if mask is None:
        return K.mean(x, axis=1)  # (batch_size, input_dim)
    else:
        # This is to remove padding from the computational graph.
        if K.ndim(mask) > K.ndim(x):
            # This is due to the bug in Bidirectional that is passing the input mask
            # instead of computing output mask.
            # TODO: Fix the implementation of Bidirectional.
            mask = K.any(mask, axis=(-2, -1))
        if K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
        masked_input = switch(mask, x, K.zeros_like(x))
        weights = K.cast(mask / (K.sum(mask) + K.epsilon()), 'float32')
        return K.sum(masked_input * weights, axis=1)  # (batch_size, input_dim)

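# Worked sketch (assumption, single-example case): the masked mean above reduces
# to summing the unmasked timesteps and dividing by their count.
from tensorflow.keras import backend as K

x = K.constant([[[2., 4.], [6., 8.], [0., 0.]]])  # (1, 3, 2); last step is padding
m = K.expand_dims(K.constant([[1., 1., 0.]]))     # (1, 3, 1)
masked_mean = K.sum(x * m, axis=1) / (K.sum(m, axis=1) + K.epsilon())
print(K.eval(masked_mean))                        # approximately [[4., 6.]]
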
def call(self, x, mask=None):
    # x: (batch_size, input_length, input_dim) where input_length = head_size + 2
    head_encoding = x[:, :-2, :]  # (batch_size, head_size, input_dim)
    prep_encoding = x[:, -2, :]   # (batch_size, input_dim)
    child_encoding = x[:, -1, :]  # (batch_size, input_dim)
    if self.composition_type == 'HPCD':
        # TODO: The following line may not work with TF.
        # (batch_size, head_size, input_dim, 1) * (1, head_size, input_dim, proj_dim)
        head_proj_prod = K.expand_dims(head_encoding) * K.expand_dims(self.dist_proj_head, dim=0)
        head_projection = K.sum(head_proj_prod, axis=2)  # (batch_size, head_size, proj_dim)
    else:
        head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, head_size, proj_dim)
    prep_projection = K.expand_dims(K.dot(prep_encoding, self.proj_prep), dim=1)     # (batch_size, 1, proj_dim)
    child_projection = K.expand_dims(K.dot(child_encoding, self.proj_child), dim=1)  # (batch_size, 1, proj_dim)
    # (batch_size, head_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC' or self.composition_type == 'HPCD':
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, 1, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type is HC.
        composed_projection = K.tanh(head_projection + child_projection)
    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, head_size, proj_dim)
    # (batch_size, head_size)
    head_word_scores = K.squeeze(K.dot(composed_projection, self.scorer), axis=-1)
    if mask is None:
        attachment_probabilities = K.softmax(head_word_scores)  # (batch_size, head_size)
    else:
        if K.ndim(mask) > 2:
            # This means this layer came after a Bidirectional layer. Keras has a bug that
            # concatenates input masks instead of output masks.
            # TODO: Fix Bidirectional instead.
            mask = K.any(mask, axis=(-2, -1))
        # We need to do a masked softmax.
        exp_scores = K.exp(head_word_scores)  # (batch_size, head_size)
        head_mask = mask[:, :-2]              # (batch_size, head_size)
        # (batch_size, head_size)
        masked_exp_scores = switch(head_mask, exp_scores, K.zeros_like(head_encoding[:, :, 0]))
        # (batch_size, 1). Adding epsilon to avoid division by 0. But epsilon is float64.
        exp_sum = K.cast(K.expand_dims(K.sum(masked_exp_scores, axis=1) + K.epsilon()), 'float32')
        attachment_probabilities = masked_exp_scores / exp_sum  # (batch_size, head_size)
    return attachment_probabilities

def call(self, x, mask=None):
    mean = super(IntraAttention, self).call(x, mask)
    # x: (batch_size, input_length, input_dim)
    # mean: (batch_size, input_dim)
    ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)), dim=0)  # (1, input_length)
    # (batch_size, input_length, input_dim)
    tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones), (0, 2, 1))
    if mask is not None:
        if K.ndim(mask) > K.ndim(x):
            # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
            # TODO: Fix Bidirectional.
            mask = K.any(mask, axis=(-2, -1))
        if K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
        x = switch(mask, x, K.zeros_like(x))
    # (batch_size, input_length, proj_dim)
    projected_combination = K.tanh(K.dot(x, self.vector_projector) + K.dot(tiled_mean, self.mean_projector))
    scores = K.dot(projected_combination, self.scorer)  # (batch_size, input_length)
    weights = K.softmax(scores)                         # (batch_size, input_length)
    attended_x = K.sum(K.expand_dims(weights) * x, axis=1)  # (batch_size, input_dim)
    return attended_x

def sparse_masked_mlm_loss(y_true, y_pred):
    # Positions whose label is 0 are treated as padding and excluded from the loss.
    mask = K.cast(K.any(y_true, axis=-1), "float32")
    cce = K.sparse_categorical_crossentropy(y_true, y_pred)
    masked_cce = mask * cce
    return K.sum(masked_cce) / (K.sum(mask) + K.epsilon())

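# Usage sketch (assumption): y_true carries sparse token ids with 0 as padding,
# y_pred is a per-token distribution over a small vocabulary; padded positions
# drop out of the averaged loss.
import numpy as np
from tensorflow.keras import backend as K

y_true = K.constant([[[2.], [5.], [0.]]])                # (batch, time, 1); 0 = padding
y_pred = K.constant(np.random.rand(1, 3, 10))
y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)  # valid probabilities
print(K.eval(sparse_masked_mlm_loss(y_true, y_pred)))    # mean loss over the 2 real tokens
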
def compute_mask(self, inputs, mask=None):
    """Computes an output mask tensor for Embedding layer.

    This is based on the inputs, mask, and the inner layer.
    If batch size is specified:
        Simply return the input `mask`. (An rnn-based implementation with
        more than one rnn inputs is required but not supported in tf.keras yet.)
    Otherwise we call `compute_mask` of the inner layer at each time step.
        If the output mask at each time step is not `None`:
            (E.g., inner layer is Masking or RNN)
            Concatenate all of them and return the concatenation.
        If the output mask at each time step is `None` and the input mask is
        not `None`: (E.g., inner layer is Dense)
            Reduce the input_mask to 2 dimensions and return it.
        Otherwise (both the output mask and the input mask are `None`):
            (E.g., `mask` is not used at all)
            Return `None`.

    Arguments:
        inputs: Tensor with shape [batch size, timesteps, ...] indicating the
            input to TimeDistributed. If static shape information is available
            for "batch size", `mask` is returned unmodified.
        mask: Either None (indicating no masking) or a Tensor indicating the
            input mask for TimeDistributed. The shape can be static or dynamic.

    Returns:
        Either None (no masking), or a [batch size, timesteps, ...] Tensor with
        an output mask for the TimeDistributed layer with the shape beyond the
        second dimension being the value of the input mask shape (if the
        computed output mask is none), an output mask with the shape beyond the
        first dimension being the value of the mask shape (if mask is not None),
        or an output mask with the shape beyond the first dimension being the
        value of the computed output shape.
    """
    # cases need to call the layer.compute_mask when input_mask is None:
    # Masking layer and Embedding layer with mask_zero
    input_shape = tf.nest.map_structure(
        lambda x: tf.TensorShape(K.int_shape(x)), inputs)
    input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
    batch_size = tf_utils.convert_shapes(input_shape)
    batch_size = tf.nest.flatten(batch_size)[0]
    is_ragged_input = tf.nest.map_structure(
        lambda x: isinstance(x, tf.RaggedTensor), inputs)
    is_ragged_input = generic_utils.to_list(tf.nest.flatten(is_ragged_input))
    if batch_size and not self._always_use_reshape or any(is_ragged_input):
        # batch size matters, we currently do not handle mask explicitly, or if
        # the layer always uses reshape approach, or the input is a ragged tensor.
        return mask
    inner_mask = mask
    if inner_mask is not None:
        inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
        inner_mask = K.reshape(inner_mask, inner_mask_shape)
    inner_input_shape = tf.nest.map_structure(
        lambda tensor: self._get_shape_tuple((-1,), tensor, 2), inputs)
    inner_inputs = tf.__internal__.nest.map_structure_up_to(
        inputs, tf.reshape, inputs, inner_input_shape)
    output_mask = self.layer.compute_mask(inner_inputs, inner_mask)
    if output_mask is None:
        if mask is None:
            return None
        # input_mask is not None, and output_mask is None:
        # we should return a not-None mask
        output_mask = mask
        for _ in range(2, len(K.int_shape(mask))):
            output_mask = K.any(output_mask, axis=-1)
    else:
        # output_mask is not None. We need to reshape it
        input_length = tf_utils.convert_shapes(input_shape)
        input_length = tf.nest.flatten(input_length)[1]
        if not input_length:
            input_length = tf.nest.map_structure(lambda x: K.shape(x)[1], inputs)
            input_length = tf.nest.flatten(input_length)[0]
        output_mask_int_shape = K.int_shape(output_mask)
        if output_mask_int_shape is None:
            # if the output_mask does not have a static shape,
            # its shape must be the same as mask's
            if mask is not None:
                output_mask_int_shape = K.int_shape(mask)
            else:
                input_shape = generic_utils.to_list(tf.nest.flatten(input_shape))[0]
                output_mask_int_shape = K.compute_output_shape(input_shape)[:-1]
        output_mask_shape = self._get_shape_tuple(
            (-1, input_length), output_mask, 1, output_mask_int_shape[1:])
        output_mask = K.reshape(output_mask, output_mask_shape)
    return output_mask

def compute_mask(self, inputs, mask=None):
    if mask is not None:
        mask = K.any(mask, axis=-1, keepdims=True)
    return mask

def get_updates(self, loss, params):
    self.updates = []
    self.updates.append(K.update_add(self.state_counter, 1))
    self.updates.append(K.update_add(self.iterator, 1))
    self.updates.append(K.update_add(self.iterations, 1))

    lr = self.lr
    # lr exponential decay
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    shapes = [K.int_shape(p) for p in params]
    x = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]
    mu = [K.update(K.zeros(shape), p) for shape, p in zip(shapes, params)]

    grads = self.get_gradients(loss, params)
    moments = [K.zeros(shape, name='moment_' + str(i))
               for (i, shape) in enumerate(shapes)]

    for x_i, x_prime_i, mu_i, g, m in zip(x, params, mu, grads, moments):
        # We update x_prime (if we are in Langevin steps, we update; otherwise we
        # switch to parameters x_i).
        dx_prime_i = g - self.gamma * (x_i - x_prime_i)
        x_prime_update_i = K.switch(
            K.any(K.stack([K.equal(self.state_counter, 0),
                           K.equal(self.num_steps, self.iterator)], axis=0),
                  axis=0),
            x_i,
            x_prime_i - self.sgld_step * dx_prime_i +
            K.sqrt(self.sgld_step) * self.sgld_noise *
            K.random_normal(K.int_shape(x_prime_i)))
        # Apply constraints.
        if getattr(x_prime_i, 'constraint', None) is not None:
            x_prime_update_i = x_prime_i.constraint(x_prime_update_i)
        self.updates.append(K.update(x_prime_i, x_prime_update_i))

        # We update mu (if we are in Langevin steps, we update; otherwise we
        # switch to parameters x_i).
        mu_update_i = K.switch(K.equal(self.state_counter, 0),
                               x_i,
                               (1 - self.alpha) * mu_i + self.alpha * x_prime_i)
        self.updates.append(K.update(mu_i, mu_update_i))

        # As described in the paper, we remove the gamma from the update because
        # it interferes with the learning annealing.
        # After each outer loop update we apply an exponential decay on gamma.
        # The following lines concern the outer loop updates.

        # Nesterov's momentum
        gradient = (x_i - mu_i)
        v = self.momentum * m - lr * gradient  # velocity
        self.updates.append(
            K.update(m, K.switch(K.equal(self.state_counter, self.L + 1), v, m)))

        if self.nesterov:
            new_x_i = x_i + self.momentum * v - lr * gradient
        else:
            new_x_i = x_i + v

        x_i_update = K.switch(K.equal(self.state_counter, self.L + 1), new_x_i, x_i)
        self.updates.append(K.update(x_i, x_i_update))

    # Gamma scoping
    gamma_update = K.switch(K.equal(self.state_counter, self.L + 1),
                            self.gamma,
                            self.gamma * (1. + self.scoping))
    self.updates.append(K.update(self.gamma, gamma_update))

    counter = K.switch(K.equal(self.state_counter, self.L + 2),
                       K.constant(0, dtype='int64'),
                       self.state_counter)
    self.updates.append(K.update(self.state_counter, counter))
    return self.updates

def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
    if mask is None:
        return None
    return K.any(mask, axis=self.axis)

def compute_mask(self, input, mask=None):
    if mask is not None and self.learn_mode == 'join':
        return K.any(mask, axis=1)
    return mask

def call(self, inputs, **kwargs):
    mask = K.any(K.not_equal(inputs, self.mask_value), axis=-1)
    return K.cast(mask, K.floatx())

def bool_match(y_true, y_pred):
    # Returns 1 only if every rounded prediction matches its label, else 0.
    return K.switch(K.any(y_true - K.round(y_pred)), K.variable(0), K.variable(1))

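# Usage sketch (assumption, TF2/eager Keras backend): evaluates to 1.0 when all
# rounded predictions match the labels.
from tensorflow.keras import backend as K

print(K.eval(bool_match(K.constant([1., 0., 1.]),
                        K.constant([0.9, 0.1, 0.8]))))  # 1.0
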
def compute_mask(x, mask_value=0):
    boolean_mask = K.any(K.not_equal(x, mask_value), axis=-1, keepdims=False)
    return K.cast(boolean_mask, K.floatx())

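# Usage sketch (assumption): timesteps whose features are all equal to mask_value
# are treated as padding and mapped to 0.0, everything else to 1.0.
from tensorflow.keras import backend as K

x = K.constant([[[1., 2.], [0., 0.], [3., 0.]]])  # (batch=1, time=3, features=2)
print(K.eval(compute_mask(x)))                    # [[1., 0., 1.]]
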
def compute_mask(self, inputs, mask=None):
    masked_cols = K.cast(K.any(K.not_equal(inputs, self.mask_value),
                               axis=-1, keepdims=True), K.floatx())
    masked_rows = K.cast(K.any(K.not_equal(inputs, self.mask_value),
                               axis=-2, keepdims=True), K.floatx())
    return K.batch_dot(masked_cols, masked_rows)

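# Worked sketch (assumption, mask_value=0): for a (batch, rows, cols) input the
# batch_dot of the two reductions yields a (batch, rows, cols) mask that is 1
# only where both the row and the column contain a non-mask_value entry.
from tensorflow.keras import backend as K

x = K.constant([[[1., 0.], [0., 0.]]])  # second row and second column are all padding
row_mask = K.cast(K.any(K.not_equal(x, 0.), axis=-1, keepdims=True), K.floatx())  # (1, 2, 1)
col_mask = K.cast(K.any(K.not_equal(x, 0.), axis=-2, keepdims=True), K.floatx())  # (1, 1, 2)
print(K.eval(K.batch_dot(row_mask, col_mask)))  # [[[1., 0.], [0., 0.]]]
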
def call(self, x, mask=None):
    assert mask is None
    return K.cast(K.any(x, axis=-1), K.floatx())

def call(self, inputs, mask=None):
    boolean_mask = K.any(K.not_equal(inputs, self.mask_value),
                         axis=-1, keepdims=self.keepdims)
    if mask is not None:
        boolean_mask *= mask
    return K.cast(boolean_mask, K.floatx())

def true_label_exists(y_true):
    return K.any(y_true, axis=0)

def compute_mask(self, x, input_mask=None):  # pylint: disable=unused-argument
    if input_mask is None:
        return None
    else:
        return K.any(input_mask, axis=-1)

def compute_mask(self, inputs, mask=None):
    if len(inputs.shape) > 2 and mask is not None:
        mask = K.any(mask, axis=-1, keepdims=False)
        return mask
    else:
        # Don't return a mask.
        return None

def compute_mask(self, inputs, mask=None):
    return K.any(K.not_equal(inputs, 0.), axis=[2, 3, 4])

def squash_mask(self, mask):
    if K.ndim(mask) == 2:
        return mask
    elif K.ndim(mask) == 3:
        return K.any(mask, axis=-1)

def pfn_mask_func(X, mask_val=mask_val):
    # Map mask_val to zero and return 1 elsewhere.
    return K.cast(K.any(K.not_equal(X, mask_val), axis=-1), K.dtype(X))

def compute_mask(self, input, mask=None):
    if mask is not None:
        return K.any(mask, axis=1)
    return mask

def call(self, inputs):
    boolean_mask = K.any(K.not_equal(inputs[1], self.mask_value),
                         axis=-1, keepdims=True)
    return inputs[0] * K.cast(boolean_mask, K.dtype(inputs[0]))

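# Worked sketch (assumption, mask_value=0): the first input is zeroed wherever the
# second input is entirely mask_value, mirroring the call body above.
from tensorflow.keras import backend as K

feats = K.constant([[[1., 1.], [2., 2.]]])
raw = K.constant([[[3., 4.], [0., 0.]]])
keep = K.cast(K.any(K.not_equal(raw, 0.), axis=-1, keepdims=True), K.dtype(feats))
print(K.eval(feats * keep))  # [[[1., 1.], [0., 0.]]]
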
def compute_mask(self, x, input_mask=None):
    return K.any(K.greater(x, self.mask_value), axis=-1)

def MaskingHack(x):
    # mask = K.repeat_elements(K.any(x[:, :, 0:-2], axis=-1), rep=x.shape[-1], axis=-1)
    mask = K.any(x[:, :, 0:-2], axis=-1, keepdims=True)
    # Cast so the boolean mask can scale the float tensor on the TF backend.
    return x * K.cast(mask, K.dtype(x))

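# Usage sketch (assumption, tf.keras with the backend imported as K): MaskingHack
# is meant to be wrapped in a Lambda layer so that timesteps whose first (d-2)
# features are all zero are zeroed out downstream.
import tensorflow as tf

inp = tf.keras.Input(shape=(None, 8))
out = tf.keras.layers.Lambda(MaskingHack)(inp)
model = tf.keras.Model(inp, out)
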
def build_model(char_size=27, dim=64, iterations=4, training=True, ilp=False, pca=False):
    """Build the model."""
    # Inputs
    # Context: (rules, preds, chars,)
    # context = L.Input(shape=(None, None, None,), name='context', dtype='int32')
    # query = L.Input(shape=(None,), name='query', dtype='int32')
    if ilp:
        context, query, templates = ilp

    # Contextual embedding of symbols
    word_index = WORD_INDEX
    print('Found %s unique tokens.' % len(word_index))

    # Load pre-trained GloVe vectors.
    embeddings_index = {}
    GLOVE_DIR = os.path.abspath('.') + "/data/glove"
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))

    EMBEDDING_DIM = 100
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    embedding_layer = L.Embedding(len(word_index) + 1,
                                  EMBEDDING_DIM,
                                  weights=[embedding_matrix],
                                  trainable=False)

    context = L.Input(shape=(None, None, None,), name='context', dtype='int32')
    query = L.Input(shape=(None,), name='query', dtype='int32')

    embedded_ctx = embedding_layer(context)  # (?, rules, preds, chars, char_size)
    embedded_q = embedding_layer(query)      # (?, chars, char_size)

    # onehot_weights = np.eye(char_size)
    # onehot_weights[0, 0] = 0  # Clear zero index
    # onehot = L.Embedding(char_size, char_size,
    #                      trainable=False,
    #                      weights=[onehot_weights],
    #                      name='onehot')
    # embedded_ctx = onehot(context)  # (?, rules, preds, chars, char_size)
    # embedded_q = onehot(query)      # (?, chars, char_size)

    if ilp:
        # Combine the templates with the context, (?, rules+temps, preds, chars, char_size)
        embedded_ctx = L.Lambda(lambda xs: K.concatenate(xs, axis=1),
                                name='template_concat')([templates, embedded_ctx])
        # embedded_ctx = L.concatenate([templates, embedded_ctx], axis=1)

    embed_pred = ZeroGRU(dim, go_backwards=True, name='embed_pred')
    embedded_predq = embed_pred(embedded_q)  # (?, dim)
    # For every rule, for every predicate, embed the predicate
    embedded_ctx_preds = L.TimeDistributed(L.TimeDistributed(embed_pred, name='nest1'),
                                           name='nest2')(embedded_ctx)
    # (?, rules, preds, dim)

    # embed_rule = ZeroGRU(dim, go_backwards=True, name='embed_rule')
    # embedded_rules = NestedTimeDist(embed_rule, name='d_embed_rule')(embedded_ctx_preds)
    get_heads = L.Lambda(lambda x: x[:, :, 0, :], name='rule_heads')
    embedded_rules = get_heads(embedded_ctx_preds)  # (?, rules, dim)

    # Reused layers over iterations
    repeat_toctx = L.RepeatVector(K.shape(embedded_ctx)[1], name='repeat_to_ctx')
    diff_sq = L.Lambda(lambda xy: K.square(xy[0] - xy[1]),
                       output_shape=(None, dim), name='diff_sq')
    mult = L.Multiply()
    concat = L.Lambda(lambda xs: K.concatenate(xs, axis=2),
                      output_shape=(None, dim * 5), name='concat')
    att_densel = L.Dense(dim // 2, activation='tanh', name='att_densel')
    att_dense = L.Dense(1, activation='sigmoid', name='att_dense')
    squeeze2 = L.Lambda(lambda x: K.squeeze(x, 2), name='sequeeze2')
    rule_mask = L.Lambda(lambda x: K.cast(
        K.any(K.not_equal(x, 0), axis=-1, keepdims=True), 'float32'),
        name='rule_mask')(embedded_rules)

    unifier = NestedTimeDist(ZeroGRU(dim, name='unifier'), name='dist_unifier')
    dot11 = L.Dot((1, 1))

    # gating = L.Dense(1, activation='sigmoid', name='gating')
    # gate2 = L.Lambda(lambda xyg: xyg[2]*xyg[0] + (1-xyg[2])*xyg[1], name='gate')

    # Reasoning iterations
    state = embedded_predq
    repeated_q = repeat_toctx(embedded_predq)
    outs = list()
    for _ in range(iterations):
        # Compute attention between rule and query state
        ctx_state = repeat_toctx(state)  # (?, rules, dim)
        s_s_c = diff_sq([ctx_state, embedded_rules])
        s_m_c = mult([embedded_rules, state])  # (?, rules, dim)
        sim_vec = concat([s_s_c, s_m_c, ctx_state, embedded_rules, repeated_q])
        sim_vec = att_densel(sim_vec)  # (?, rules, dim//2)
        sim_vec = att_dense(sim_vec)   # (?, rules, 1)
        sim_vec = mult([sim_vec, rule_mask])
        sim_vec = squeeze2(sim_vec)    # (?, rules)
        # sim_vec = L.Softmax(axis=1)(sim_vec)
        outs.append(sim_vec)

        # Unify every rule and weighted sum based on attention
        new_states = unifier(embedded_ctx_preds, initial_state=[state])
        # (?, rules, dim)
        state = dot11([sim_vec, new_states])

        # Apply gating
        # gate = gating(state)
        # outs.append(gate)
        # state = gate2([state, new_state, gate])

    # Predication
    out = L.Dense(1, activation='sigmoid', name='out')(state)

    if ilp:
        return outs, out
    elif pca:
        model = Model([context, query], [embedded_rules])
    elif training:
        model = Model([context, query], [out])
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['acc'])
    else:
        model = Model([context, query], outs + [out])
    return model

# Building Keras MTLSTM model with a short-cut fix for the Keras masking + Bidirectional issue
x = Input(shape=(None, 300))
y = Masking(mask_value=0., input_shape=(None, 300))(x)
y = Bidirectional(LSTM(300, return_sequences=True, recurrent_activation='sigmoid',
                       name='lstm1'), name='bidir_1')(y)
y = Bidirectional(LSTM(300, return_sequences=True, recurrent_activation='sigmoid',
                       name='lstm2'), name='bidir_2')(y)

# These 2 layers are a short-cut fix for the issue -
y_rev_mask_fix = Lambda(lambda x: K.cast(
    K.any(K.not_equal(x, 0.), axis=-1, keepdims=True), K.floatx()))(x)
y = Multiply()([y, y_rev_mask_fix])

keras_model = Model(inputs=x, outputs=y)

# Load the Python3 port of the model - MAKE SURE THIS FILE EXISTS BEFORE RUNNING THE SCRIPT -
# GET IT FROM https://github.com/rgsachin/CoVe
keras_model.load_weights('Keras_CoVe.h5')

# Save a new Python2 port of the model
keras_model.save('Keras_CoVe_Python2.h5')

print("Done")

def masked_loss(y_true, y_pred):
    y_mask = K.cast(K.any(y_true, axis=-1), "float32")
    loss = K.switch(y_mask,
                    K.sparse_categorical_crossentropy(y_true, y_pred),
                    K.zeros_like(y_mask, dtype=K.floatx()))
    return K.sum(loss) / (K.cast(K.sum(y_mask), dtype='float32') + K.epsilon())

def get_seq_length(x):
    # Number of timesteps that contain at least one non-zero feature.
    return K.sum(K.cast(K.any(K.not_equal(x, 0), axis=-1), K.floatx()), axis=-1)

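# Usage sketch (assumption): counts the timesteps that contain at least one
# non-zero feature, i.e. the unpadded length of each sequence.
from tensorflow.keras import backend as K

x = K.constant([[[1., 0.], [2., 3.], [0., 0.]]])  # one sequence, two real steps
print(K.eval(get_seq_length(x)))                  # [2.]
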
def compute_mask(self, inputs, input_mask=None):  # pylint: disable=unused-argument
    options = inputs[2]
    padding_mask = K.not_equal(options, K.zeros_like(options))
    return K.cast(K.any(padding_mask, axis=2), "float32")

def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)

    # First update the number of iterations.
    self.updates = [K.update_add(self.iterations, 1)]

    if self.decay_epochs:
        ite_casted = K.cast(self.iterations, K.dtype(self.decay_epochs))
        hit_decay_epoch = K.any(K.equal(ite_casted, self.decay_epochs))
        print(hit_decay_epoch)
        lr = K.switch(hit_decay_epoch, self.lr['all'] * self.decay['all'], self.lr['all'])
        K.print_tensor(self.lr['all'])
        # a = K.switch(hit_decay_epoch,
        #              K.print_tensor(self.lr['all'], message='Decays:'),
        #              K.print_tensor(self.lr['all'], message=' '))
        self.updates.append(K.update(self.lr['all'], lr))

    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(s) for s in shapes]
    self.weights = [self.iterations] + moments
    print(self.weights)

    for p, g, m in zip(params, grads, moments):
        # print("HEREEEE:", p.name, g, m)
        if p.name in self.lr.keys():
            if self.verbose > 0:
                print("Setting different learning rate for ", p.name, " : ",
                      K.eval(self.lr[p.name]))
            lr = self.lr[p.name]
            if self.decay_epochs and p.name in self.decay.keys():
                lr = K.switch(hit_decay_epoch,
                              self.lr[p.name] * self.decay[p.name],
                              self.lr[p.name])
                self.updates.append(K.update(self.lr[p.name], lr))
                if self.verbose > 0:
                    print("Added decay to ", p.name, ": ", K.eval(lr), ",",
                          self.decay[p.name])
            elif self.decay_epochs:
                lr = K.switch(hit_decay_epoch,
                              self.lr[p.name] * self.decay['all'],
                              self.lr[p.name])
                self.updates.append(K.update(self.lr[p.name], lr))
                if self.verbose > 0:
                    print("Added decay to ", p.name, ": ", K.eval(lr), ",",
                          self.decay['all'])
            else:
                lr = self.lr[p.name]
        else:
            lr = self.lr['all']

        if p.name in self.momentum.keys():
            if self.verbose > 0:
                print("Setting different momentum for ", p.name, " , ",
                      K.eval(self.momentum[p.name]))
            momentum = self.momentum[p.name]
        else:
            momentum = self.momentum['all']

        v = momentum * m - lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + momentum * (momentum * m - lr * g) - lr * g
        else:
            new_p = p + momentum * m - lr * g

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        if self.clips_val and (p.name in self.clips.keys()):
            if self.verbose > 0:
                print("Clipping variable", p.name, " to ", self.clips[p.name])
            c = K.eval(self.clips[p.name])
            new_p = K.clip(new_p, c[0], c[1])

        print("updates for ", p.name, " lr: ", K.eval(lr), " mom:", K.eval(momentum))
        self.updates.append(K.update(p, new_p))
    return self.updates

def build_model(char_size=27, dim=64, iterations=4, training=True, pca=False):
    """Build the model."""
    # Inputs
    # Context: (rules, preds, chars,)
    context = L.Input(shape=(None, None, None,), name='context', dtype='int32')
    query = L.Input(shape=(None,), name='query', dtype='int32')

    # Flatten preds to embed entire rules
    var_flat = L.Lambda(lambda x: K.reshape(
        x, K.stack([K.shape(x)[0], -1, K.prod(K.shape(x)[2:])])), name='var_flat')
    flat_ctx = var_flat(context)  # (?, rules, preds*chars)

    # Onehot embedding
    # Contextual embedding of symbols
    onehot_weights = np.eye(char_size)
    onehot_weights[0, 0] = 0  # Clear zero index
    onehot = L.Embedding(char_size, char_size,
                         trainable=False,
                         weights=[onehot_weights],
                         name='onehot')
    embedded_ctx = onehot(flat_ctx)  # (?, rules, preds*chars*char_size)
    embedded_q = onehot(query)       # (?, chars, char_size)

    embed_pred = ZeroGRU(dim, go_backwards=True, name='embed_pred')
    embedded_predq = embed_pred(embedded_q)  # (?, dim)
    # Embed every rule
    embedded_rules = NestedTimeDist(embed_pred, name='rule_embed')(embedded_ctx)
    # (?, rules, dim)

    # Reused layers over iterations
    repeat_toctx = L.RepeatVector(K.shape(embedded_ctx)[1], name='repeat_to_ctx')
    diff_sq = L.Lambda(lambda xy: K.square(xy[0] - xy[1]),
                       output_shape=(None, dim), name='diff_sq')
    concat = L.Lambda(lambda xs: K.concatenate(xs, axis=2),
                      output_shape=(None, dim * 5), name='concat')
    att_dense1 = L.TimeDistributed(L.Dense(dim, activation='tanh', name='att_dense1'),
                                   name='d_att_dense1')
    att_dense2 = L.TimeDistributed(L.Dense(1, activation='sigmoid', name='att_dense2'),
                                   name='d_att_dense2')
    squeeze2 = L.Lambda(lambda x: K.squeeze(x, 2), name='sequeeze2')
    # expand = L.Lambda(lambda x: K.expand_dims(x, axis=2), name='expand')
    rule_mask = L.Lambda(lambda x: K.cast(
        K.any(K.not_equal(x, 0), axis=-1, keepdims=True), 'float32'),
        name='rule_mask')(embedded_rules)
    episodic_mem = EpisodicMemory(dim, name='episodic_mem')

    # Reasoning iterations
    state = embedded_predq
    repeated_q = repeat_toctx(embedded_predq)
    outs = list()
    for _ in range(iterations):
        # Compute attention between rule and query state
        ctx_state = repeat_toctx(state)  # (?, rules, dim)
        s_s_c = diff_sq([ctx_state, embedded_rules])
        s_m_c = L.multiply([embedded_rules, state])  # (?, rules, dim)
        sim_vec = concat([s_s_c, s_m_c, ctx_state, embedded_rules, repeated_q])
        sim_vec = att_dense1(sim_vec)  # (?, rules, dim)
        sim_vec = att_dense2(sim_vec)  # (?, rules, 1)
        # sim_vec = squeeze2(sim_vec)  # (?, rules)
        # sim_vec = L.Softmax(axis=1)(sim_vec)
        # sim_vec = expand(sim_vec)    # (?, rules, 1)
        sim_vec = L.multiply([sim_vec, rule_mask])

        state = episodic_mem([state, sim_vec, embedded_rules])
        sim_vec = squeeze2(sim_vec)  # (?, rules)
        outs.append(sim_vec)

    # Predication
    out = L.Dense(1, activation='sigmoid', name='out')(state)

    if pca:
        model = Model([context, query], [embedded_rules])
    elif training:
        model = Model([context, query], [out])
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['acc'])
    else:
        model = Model([context, query], outs + [out])
    return model

def call(self, x, mask=None):
    boolean_mask = K.any(K.greater(x, self.mask_value), axis=-1, keepdims=True)
    return x * K.cast(boolean_mask, K.floatx())

def changing_ndim_rnn_tf(step_function, inputs, initial_states, go_backwards,
                         mask, constants, unroll, input_length,
                         eliminate_mask_dims):
    '''Iterates over the time dimension of a tensor.

    # Arguments
        inputs: tensor of temporal data of shape (samples, time, ...)
            (at least 3D).
        step_function:
            Parameters:
                input: tensor with shape (samples, ...) (no time dimension),
                    representing input for the batch of samples at a certain
                    time step.
                states: list of tensors.
            Returns:
                output: tensor with shape (samples, output_dim)
                    (no time dimension),
                new_states: list of tensors, same length and shapes as 'states'.
                    The first state in the list must be the output tensor at
                    the previous timestep.
        initial_states: tensor with shape (samples, output_dim)
            (no time dimension), containing the initial values for the states
            used in the step function.
        go_backwards: boolean. If True, do the iteration over the time
            dimension in reverse order.
        mask: binary tensor with shape (samples, time, 1), with a zero for
            every element that is masked.
        constants: a list of constant values passed at each step.
        unroll: with TensorFlow the RNN is always unrolled, but with Theano you
            can use this boolean flag to unroll the RNN.
        input_length: not relevant in the TensorFlow implementation. Must be
            specified if using unrolling with Theano.

    # Returns
        A tuple (last_output, outputs, new_states).
            last_output: the latest output of the rnn, of shape (samples, ...)
            outputs: tensor with shape (samples, time, ...) where each entry
                outputs[s, t] is the output of the step function at time t for
                sample s.
            new_states: list of tensors, latest states returned by the step
                function, of shape (samples, ...).
    '''
    import tensorflow as tf

    ndim = len(inputs.get_shape())
    assert ndim >= 3, 'Input should be at least 3D.'
    axes = [1, 0] + list(range(2, ndim))
    inputs = tf.transpose(inputs, (axes))

    if constants is None:
        constants = []

    if unroll:
        if not inputs.get_shape()[0]:
            raise Exception('Unrolling requires a fixed number of timesteps.')

        states = initial_states
        successive_states = []
        successive_outputs = []

        input_list = tf.unpack(inputs)
        if go_backwards:
            input_list.reverse()

        if mask is not None:
            # Transpose not supported by bool tensor types, hence round-trip to uint8.
            mask = tf.cast(mask, tf.uint8)
            if len(mask.get_shape()) == ndim - 1:
                mask = K.expand_dims(mask)
            # Reshaping mask to make timesteps the first dimension.
            mask = tf.cast(tf.transpose(mask, axes), tf.bool)
            mask_list = tf.unpack(mask)

            if go_backwards:
                mask_list.reverse()

            # Iterating over timesteps.
            for input, mask_t in zip(input_list, mask_list):
                # Changing ndim modification: Pass the mask to the step function
                # as a constant.
                output, new_states = step_function(input, states + constants + [mask_t])

                # tf.select needs its condition tensor to be the same shape as its
                # two result tensors, but in our case the condition (mask) tensor is
                # (nsamples, 1), and A and B are (nsamples, ndimensions). So we need
                # to broadcast the mask to match the shape of A and B. That's what
                # the tile call does, it just repeats the mask along its second
                # dimension ndimensions times.
                output_mask_t = tf.tile(mask_t,
                                        tf.pack(([1] * (ndim - 2)) + [tf.shape(output)[1]]))

                if len(successive_outputs) == 0:
                    prev_output = K.zeros_like(output)
                else:
                    prev_output = successive_outputs[-1]

                # Changing ndim modification: Define output mask with appropriate
                # dims eliminated.
                if eliminate_mask_dims is not None:
                    output_mask_t = tf.cast(K.any(output_mask_t, axis=eliminate_mask_dims), tf.bool)
                else:
                    output_mask_t = tf.cast(output_mask_t, tf.bool)

                output = tf.select(output_mask_t, output, prev_output)

                return_states = []
                for state, new_state in zip(states, new_states):
                    # (see earlier comment for tile explanation)
                    state_mask_t = tf.tile(mask_t,
                                           tf.pack(([1] * (ndim - 2)) + [tf.shape(new_state)[1]]))
                    # Changing ndim modification: Define state mask with
                    # appropriate dims eliminated.
                    if eliminate_mask_dims is not None:
                        state_mask_t = tf.cast(K.any(state_mask_t, axis=eliminate_mask_dims), tf.bool)
                    else:
                        state_mask_t = tf.cast(state_mask_t, tf.bool)
                    return_states.append(tf.select(state_mask_t, new_state, state))

                states = return_states
                successive_outputs.append(output)
                successive_states.append(states)

            last_output = successive_outputs[-1]
            new_states = successive_states[-1]
            outputs = tf.pack(successive_outputs)
        else:
            for input in input_list:
                output, states = step_function(input, states + constants + [None])  # None for mask
                successive_outputs.append(output)
                successive_states.append(states)
            last_output = successive_outputs[-1]
            new_states = successive_states[-1]
            outputs = tf.pack(successive_outputs)

    else:
        from tensorflow.python.ops.rnn import _dynamic_rnn_loop

        if go_backwards:
            inputs = tf.reverse(inputs, [True] + [False] * (ndim - 1))

        states = initial_states
        nb_states = len(states)
        if nb_states == 0:
            # use dummy state, otherwise _dynamic_rnn_loop breaks
            state = inputs[:, 0, :]
            state_size = state.get_shape()[-1]
        else:
            state_size = int(states[0].get_shape()[-1])
            if nb_states == 1:
                state = states[0]
            else:
                state = tf.concat(1, states)

        if mask is not None:
            if len(initial_states) == 0:
                raise ValueError('No initial states provided! '
                                 'When using masking in an RNN, you should '
                                 'provide initial states '
                                 '(and your step function should return '
                                 'as its first state at time `t` '
                                 'the output at time `t-1`).')
            if go_backwards:
                mask = tf.reverse(mask, [True] + [False] * (ndim - 2))

            # Transpose not supported by bool tensor types, hence round-trip to uint8.
            mask = tf.cast(mask, tf.uint8)
            if len(mask.get_shape()) == ndim - 1:
                mask = K.expand_dims(mask)
            mask = tf.transpose(mask, axes)
            # Concatenate at the last dim.
            inputs = tf.concat(ndim - 1, [tf.cast(mask, inputs.dtype), inputs])

            def _step(input, state):
                if nb_states > 1:
                    states = []
                    for i in range(nb_states):
                        states.append(state[:, i * state_size: (i + 1) * state_size])
                else:
                    states = [state]
                # The time dimension is not present here.
                step_ndim = ndim - 1
                # Permuting only to take out the mask.
                permuted_input = K.permute_dimensions(
                    input, (step_ndim - 1,) + tuple(range(step_ndim - 1)))
                mask_t = K.expand_dims(permuted_input[0])
                permuted_input = permuted_input[1:]
                input = K.permute_dimensions(permuted_input, tuple(range(1, step_ndim)) + (0,))

                # Changing ndim fix: eliminate the necessary dims after selecting
                # the mask from the input.
                if eliminate_mask_dims is not None:
                    output_mask_t = K.sum(mask_t, axis=eliminate_mask_dims)
                else:
                    # No dims to eliminate; use the mask as-is.
                    output_mask_t = mask_t
                mask_t = tf.cast(mask_t, tf.bool)
                output_mask_t = tf.cast(output_mask_t, tf.bool)

                output, new_states = step_function(input, states + constants + [mask_t])

                tiled_output_mask_t = tf.tile(output_mask_t, tf.pack([1, tf.shape(output)[1]]))
                output = tf.select(tiled_output_mask_t, output, states[0])

                return_states = []
                for state, new_state in zip(states, new_states):
                    tiled_state_mask_t = tf.tile(output_mask_t, tf.pack([1, tf.shape(state)[1]]))
                    return_states.append(tf.select(tiled_state_mask_t, new_state, state))

                if len(return_states) == 1:
                    new_state = return_states[0]
                else:
                    new_state = tf.concat(1, return_states)
                return output, new_state
        else:
            def _step(input, state):
                if nb_states > 1:
                    states = []
                    for i in range(nb_states):
                        states.append(state[:, i * state_size: (i + 1) * state_size])
                elif nb_states == 1:
                    states = [state]
                else:
                    states = []
                output, new_states = step_function(input, states + constants + [None])  # None for mask

                if len(new_states) > 1:
                    new_state = tf.concat(1, new_states)
                elif len(new_states) == 1:
                    new_state = new_states[0]
                else:
                    # return dummy state, otherwise _dynamic_rnn_loop breaks
                    new_state = output
                return output, new_state

        _step.state_size = state_size * nb_states
        # recover output size by calling _step on the first input
        slice_begin = tf.pack([0] * ndim)
        slice_size = tf.pack([1] + [-1] * (ndim - 1))
        first_input = tf.slice(inputs, slice_begin, slice_size)
        first_input = tf.squeeze(first_input, [0])
        _step.output_size = int(_step(first_input, state)[0].get_shape()[-1])

        (outputs, final_state) = _dynamic_rnn_loop(
            _step,
            inputs,
            state,
            parallel_iterations=32,
            swap_memory=True,
            sequence_length=None)

        if nb_states > 1:
            new_states = []
            for i in range(nb_states):
                new_states.append(final_state[:, i * state_size: (i + 1) * state_size])
        elif nb_states == 1:
            new_states = [final_state]
        else:
            new_states = []

        outputs_ndim = len(outputs.get_shape())
        # all this circus is to recover the last vector in the sequence.
        slice_begin = tf.pack([tf.shape(outputs)[0] - 1] + [0] * (outputs_ndim - 1))
        slice_size = tf.pack([1] + [-1] * (outputs_ndim - 1))
        last_output = tf.slice(outputs, slice_begin, slice_size)
        last_output = tf.squeeze(last_output, [0])

        axes = [1, 0] + list(range(2, len(outputs.get_shape())))
        outputs = tf.transpose(outputs, axes)

    return last_output, outputs, new_states
