def angular_loss_2(y_true, y_pred):
    """Triplet-style loss over consecutive (anchor, positive, negative) rows.

    Rows of ``y_pred`` are consumed three at a time as anchor, positive and
    negative. Two per-triplet terms are collected, each list is reduced with
    ``log(1 + 2 * sum(exp(.)))``, and the two results are combined as
    ``first + 2 * second`` before averaging over ``batch_size / 3``.

    Args:
        y_true: Unused; kept so the function matches the Keras loss signature.
        y_pred: Predictions, indexed along the first axis.

    Returns:
        The averaged loss, clamped from below at zero.
    """
    y_pred = K.clip(y_pred, _EPSILON, 1.0 - _EPSILON)

    g = tf.constant(1.0, shape=[1], dtype=tf.float32)
    c = tf.constant(4.0, shape=[1], dtype=tf.float32)
    d = tf.constant(2.0, shape=[1], dtype=tf.float32)
    # NOTE(review): tf.tan expects radians; 45.0 looks like degrees - confirm.
    alpha = tf.constant(45.0, shape=[1], dtype=tf.float32)
    tan_sq_alpha = tf.tan(alpha) ** 2  # loop-invariant

    losses = []
    losses2 = []
    # `batch_size` is read from the enclosing module scope.
    for i in range(0, batch_size, 3):
        try:
            xa = y_pred[i + 0]
            xp = y_pred[i + 1]
            xn = y_pred[i + 2]
        except (IndexError, ValueError, tf.errors.InvalidArgumentError):
            # Incomplete trailing triplet: skip it, as the original did.
            continue

        # NOTE(review): tan wraps the whole product here rather than only
        # alpha - confirm this matches the intended formula.
        fapn = (c * (tf.tan(alpha * K.transpose(xa + xp) * xn) ** 2)
                - d * (g + tan_sq_alpha) * K.transpose(xa) * xp)
        losses.append(fapn)
        losses2.append(K.transpose(xa) * xn - K.transpose(xa) * xp)
        # The former per-iteration `loss = loss + g + _loss` was removed: its
        # result was overwritten after the loop and `_loss` is not defined in
        # this function, so it could only raise into the bare except.

    loss = K.sum(K.log(1 + 2 * K.sum([K.exp(v) for v in losses])))
    loss2 = K.sum(K.log(1 + 2 * K.sum([K.exp(v) for v in losses2])))
    loss = loss + 2 * loss2
    loss = loss / (batch_size / 3)

    zero = tf.constant(0.0, shape=[1], dtype=tf.float32)
    return tf.maximum(loss, zero)
def Kget_dists(X):
    """Keras code for the pairwise squared-distance matrix of the rows of X.

    Uses the expansion |a|^2 + |b|^2 - 2 * a.b for every pair of rows.
    """
    row_sq_norms = K.sum(K.square(X), axis=1)
    norms_column = K.expand_dims(row_sq_norms, 1)
    inner_products = K.dot(X, K.transpose(X))
    return norms_column + K.transpose(norms_column) - 2 * inner_products
def fallback_metric(self, y_true, y_pred):
    """Batch accuracy with a hierarchy-based fallback for unsure predictions.

    A prediction whose top score exceeds ``self.threshold`` is correct only
    when its argmax equals the argmax of ``y_true``. Otherwise the lineages of
    the top-2 predictions are intersected and compared against the lineage of
    the true label; an overlap score above 1 counts as correct.

    Returns:
        Mean of the per-sample 0/1 correctness vector, as float32.
    """
    # Most confident score per sample, compared against the threshold.
    top_score = K.max(y_pred, axis=-1)
    cutoff = tf.fill(tf.shape(top_score), self.threshold)
    is_confident = tf.cast(top_score > cutoff, tf.int32)
    is_unsure = tf.cast(top_score <= cutoff, tf.int32)

    # Confident case: exact match between predicted and true label.
    true_idx = K.argmax(y_true, -1)
    pred_idx = K.argmax(y_pred, -1)
    exact_match = tf.cast(math_ops.equal(true_idx, pred_idx), tf.int32)

    # Unsure case: the adjacency matrix points from ancestor to descendant,
    # so its transpose is used to look up each node's lineage.
    _, top2_idx = tf.math.top_k(y_pred, k=2)
    lineage_table = K.transpose(self.hierarchy.A)
    top2_lineages = K.cast(tf.gather(lineage_table, top2_idx), tf.int32)
    shared_lineage = tf.bitwise.bitwise_and(top2_lineages[:, 0],
                                            top2_lineages[:, 1])
    true_lineage = K.cast(tf.gather(lineage_table, true_idx), tf.int32)

    # Overlap between the shared lineage and the lineage of the true label.
    overlap_score = K.batch_dot(shared_lineage, true_lineage)
    lineage_match = tf.squeeze(tf.cast(overlap_score > 1, tf.int32))

    # Each sample contributes through exactly one of the two branches.
    confident_hits = tf.math.multiply(is_confident, exact_match)
    unsure_hits = tf.math.multiply(is_unsure, lineage_match)
    correct = confident_hits + unsure_hits
    return K.mean(K.cast(correct, tf.float32))
def call(self, inputs, **kwargs):
    """Soft-assign samples to clusters with a Student's t kernel (as in t-SNE).

    q_ij = 1 / (1 + dist(x_i, u_j)^2 / alpha), raised to (alpha + 1) / 2 and
    then normalised so that every row sums to one.

    Arguments:
        inputs: the variable containing data, shape=(n_samples, n_features)
    Return:
        q: soft labels for each sample, shape=(n_samples, n_clusters)
    """
    offsets = K.expand_dims(inputs, axis=1) - self.clusters
    squared_dist = K.sum(K.square(offsets), axis=2)
    q = 1.0 / (1.0 + (squared_dist / self.alpha))
    q = q ** ((self.alpha + 1.0) / 2.0)
    # Transpose so the per-sample totals broadcast along the last axis.
    row_totals = K.sum(q, axis=1)
    return K.transpose(K.transpose(q) / row_totals)
def _compute_carry_and_output(self, x, h_tm1, c_tm1, b):
    """Computes carry and output using split kernels.

    Each gate takes its own column slice of ``self.recurrent_kernel``
    (transposed before the matmul) and its own bias from ``b``.
    """
    x_i, x_f, x_c, x_o = x
    h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1
    b_i2, b_f2, b_c2, b_o2 = b

    units = self.units
    kernel = self.recurrent_kernel

    def _recurrent_term(h_prev, kernel_slice, bias):
        # Recurrent contribution for one gate.
        return K.bias_add(K.dot(h_prev, K.transpose(kernel_slice)), bias)

    i = self.recurrent_activation(
        x_i + _recurrent_term(h_tm1_i, kernel[:, :units], b_i2))
    f = self.recurrent_activation(
        x_f + _recurrent_term(h_tm1_f, kernel[:, units:units * 2], b_f2))
    candidate = self.activation(
        x_c + _recurrent_term(h_tm1_c, kernel[:, units * 2:units * 3], b_c2))
    c = f * c_tm1 + i * candidate
    o = self.recurrent_activation(
        x_o + _recurrent_term(h_tm1_o, kernel[:, units * 3:], b_o2))
    return c, o
def call(self, inputs, **kwargs):
    """Score a projected input against every row of an embedding matrix.

    ``inputs`` is a pair ``(main_input, embedding_matrix)``. The main input is
    flattened to 2-D, projected, optionally biased and dropped out, multiplied
    by the transposed embedding matrix, optionally scaled by sqrt(d), passed
    through ``self.activation`` and reshaped using the first two dynamic
    dimensions of the main input.
    """
    main_input, embedding_matrix = inputs
    dynamic_shape = K.shape(main_input)
    feature_dim = K.int_shape(main_input)[-1]
    emb_input_dim, emb_output_dim = K.int_shape(embedding_matrix)

    flattened = K.reshape(main_input, (-1, feature_dim))
    projected = K.dot(flattened, self.embedding_weights['projection'])
    if self.add_biases:
        projected = K.bias_add(
            projected,
            self.embedding_weights['biases'],
            data_format='channels_last')
    if 0 < self.projection_dropout < 1:
        # `projected` is deliberately rebound in place so the lambda and the
        # fallback argument see the same tensor as in the original code.
        projected = K.in_train_phase(
            lambda: K.dropout(projected, self.projection_dropout),
            projected,
            training=kwargs.get('training'))

    attention = K.dot(projected, K.transpose(embedding_matrix))
    if self.scaled_attention:
        # Scaled dot-product attention, described in
        # "Attention is all you need" (https://arxiv.org/abs/1706.03762)
        sqrt_d = K.constant(math.sqrt(emb_output_dim), dtype=K.floatx())
        attention = attention / sqrt_d

    output_shape = (dynamic_shape[0], dynamic_shape[1], emb_input_dim)
    return K.reshape(self.activation(attention), output_shape)
def call(self, inputs, training=None):
    """Convolve ``inputs``, optionally spectrally normalising the kernel first.

    When ``self.spectral_normalization`` is set, one power-iteration step
    estimates sigma, the kernel is divided by it, and ``self.kernel`` is
    replaced by the normalised tensor. Unless ``training`` is 0/False, the
    vector ``self.u`` is updated as a control dependency of the reshape.
    """

    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v**2)**0.5 + eps)

    def power_iteration(W, u):
        v_next = _l2normalize(K.dot(u, K.transpose(W)))
        u_next = _l2normalize(K.dot(v_next, W))
        return u_next, v_next

    if self.spectral_normalization:
        kernel_shape = self.kernel.shape.as_list()
        # Flatten everything but the last axis.
        flat_kernel = K.reshape(self.kernel, [-1, kernel_shape[-1]])
        u_hat, v_hat = power_iteration(flat_kernel, self.u)
        # sigma = v W u^T
        sigma = K.dot(K.dot(v_hat, flat_kernel), K.transpose(u_hat))
        normalised = flat_kernel / sigma
        if training in {0, False}:
            normalised = K.reshape(normalised, kernel_shape)
        else:
            with tf.control_dependencies([self.u.assign(u_hat)]):
                normalised = K.reshape(normalised, kernel_shape)
        # The layer's kernel attribute is overwritten by the normalised one.
        self.kernel = normalised

    if self.rank == 1:
        outputs = K.conv1d(
            inputs,
            self.kernel,
            strides=self.strides[0],
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate[0])
    elif self.rank == 2:
        outputs = K.conv2d(
            inputs,
            self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)
    elif self.rank == 3:
        outputs = K.conv3d(
            inputs,
            self.kernel,
            strides=self.strides,
            padding=self.padding,
            data_format=self.data_format,
            dilation_rate=self.dilation_rate)

    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)

    if self.activation is None:
        return outputs
    return self.activation(outputs)
def gram_matrix(x, norm_by_channels=False):
    """Return the normalised Gram matrix of the tensor ``x``.

    Args:
        x: A 3d (H, W, C) tensor or a 4d (B, H, W, C) tensor.
        norm_by_channels: If True divide by C * H * W, otherwise by H * W.

    Returns:
        A (C, C) Gram matrix for 3d input, or a batch of them (B, C, C) for
        4d input.

    Raises:
        ValueError: If ``x`` is neither 3d nor 4d.
    """
    if K.ndim(x) == 3:
        features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
        shape = K.shape(x)
        # x is still (H, W, C) here; the previous `C, H, W` unpacking made
        # the default denominator W * C instead of H * W.
        H, W, C = shape[0], shape[1], shape[2]
        gram = K.dot(features, K.transpose(features))
    elif K.ndim(x) == 4:
        # Swap from (B, H, W, C) to (B, C, H, W)
        x = K.permute_dimensions(x, (0, 3, 1, 2))
        shape = K.shape(x)
        B, C, H, W = shape[0], shape[1], shape[2], shape[3]
        # Reshape as a batch of 2D matrices with vectorized channels
        features = K.reshape(x, K.stack([B, C, H * W]))
        # This is a batch of Gram matrices (B, C, C).
        gram = K.batch_dot(features, features, axes=2)
    else:
        raise ValueError(
            'The input tensor should be either a 3d (H, W, C) or 4d (B, H, W, C) tensor.'
        )

    # Normalize the Gram matrix
    if norm_by_channels:
        denominator = C * H * W  # Normalization from Johnson
    else:
        denominator = H * W  # Normalization from Google
    return gram / K.cast(denominator, x.dtype)
def shift(shape, stride, anchors):
    """Produce shifted anchors based on shape of the map and stride size.

    Args:
        shape: Shape to shift the anchors over; index 0 drives the y axis and
            index 1 the x axis.
        stride: Stride to shift the anchors with over the shape.
        anchors: The anchors to apply at each location.

    Returns:
        shifted anchors, reshaped to ``[k * number_of_anchors, 4]``.
    """

    def _cell_centres(count):
        # (index + 0.5) * stride for every index along one axis.
        offsets = K.arange(0, count, dtype=K.floatx())
        return (offsets + K.constant(0.5, dtype=K.floatx())) * stride

    grid_x, grid_y = tf.meshgrid(_cell_centres(shape[1]),
                                 _cell_centres(shape[0]))
    flat_x = K.reshape(grid_x, [-1])
    flat_y = K.reshape(grid_y, [-1])

    # One row per grid point: (x, y, x, y).
    shifts = K.transpose(K.stack([flat_x, flat_y, flat_x, flat_y], axis=0))

    number_of_anchors = K.shape(anchors)[0]
    k = K.shape(shifts)[0]  # number of base points = feat_h * feat_w

    shifts = K.cast(K.reshape(shifts, [k, 1, 4]), K.floatx())
    combined = K.reshape(anchors, [1, number_of_anchors, 4]) + shifts
    return K.reshape(combined, [k * number_of_anchors, 4])
def call(self, inputs, training=None):
    """Dense forward pass with a spectrally normalised kernel.

    One power-iteration step estimates sigma; the kernel divided by sigma is
    used for the matmul. Unless ``training`` is 0/False, ``self.u`` is updated
    as a control dependency of the reshape.
    """

    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        v_next = _l2normalize(K.dot(u, K.transpose(W)))
        u_next = _l2normalize(K.dot(v_next, W))
        return u_next, v_next

    kernel_shape = self.kernel.shape.as_list()
    # Flatten everything but the last axis.
    flat_kernel = K.reshape(self.kernel, [-1, kernel_shape[-1]])
    u_hat, v_hat = power_iteration(flat_kernel, self.u)
    # sigma = v W u^T
    sigma = K.dot(K.dot(v_hat, flat_kernel), K.transpose(u_hat))
    normalised = flat_kernel / sigma

    if training in {0, False}:
        normalised = K.reshape(normalised, kernel_shape)
    else:
        with tf.control_dependencies([self.u.assign(u_hat)]):
            normalised = K.reshape(normalised, kernel_shape)

    output = K.dot(inputs, normalised)
    if self.use_bias:
        output = K.bias_add(output, self.bias, data_format='channels_last')
    if self.activation is not None:
        output = self.activation(output)
    return output
def call(self, inputs, **kwargs):
    """Apply the masked recurrent kernel to the flattened input.

    The input is returned untouched when its leading dimension is falsy.
    Otherwise the last two axes are swapped and flattened, multiplied by
    ``recurrent_kernel * recurrent_mask``, optionally biased and activated,
    and finally reshaped back using ``self.target_shape``.
    """
    if not inputs.shape[0]:
        return inputs

    recurrent_input = ops.convert_to_tensor(inputs)
    if not self._mixed_precision_policy.should_cast_variables:
        recurrent_input = math_ops.cast(recurrent_input, self.dtype)
    batch_size = recurrent_input.shape[0]

    # Flatten last two dimensions, but along dimension [2]
    swapped = K.permute_dimensions(recurrent_input, (0, 2, 1))
    flat_recurrent = K.reshape(swapped, (batch_size, -1))

    masked_kernel = tf.math.multiply(self.recurrent_kernel,
                                     self.recurrent_mask)
    outputs = gen_math_ops.mat_mul(flat_recurrent, masked_kernel)
    if self.use_bias:
        outputs = nn.bias_add(outputs, self.bias)
    if self.activation is not None:
        outputs = self.activation(outputs)

    # Transform back outputs to original shape
    dim0, dim1 = self.target_shape[0], self.target_shape[1]
    outputs = K.reshape(K.transpose(outputs), (dim0, dim1, batch_size))
    outputs = K.reshape(outputs, (dim1, dim0, batch_size))
    return K.permute_dimensions(outputs, (2, 1, 0))
def select_best_leaf(self, y_pred):
    """Walk ``self.depth`` branching steps from the root over propagated scores.

    Returns:
        The predictions produced by the last ``self._branch`` step.
    """
    if self.N > self.num_leaves:
        # More nodes than leaves: pad the predictions via self._pad
        # (presumably one zero per non-leaf node - confirm in _pad).
        y_pred = self._pad(y_pred)

    # propagate the probabilities (algo 1)
    propagated_probabilities = K.transpose(
        K.dot(self.A, K.transpose(y_pred)))

    # Repeat the root mask once per sample, then reshape to (<batch size>,).
    batch_size = K.shape(y_pred)[0]
    repeated_root = K.repeat(self.root, batch_size)
    predictions = K.reshape(repeated_root, (batch_size, ))

    # each branch walks further out toward leaf nodes (and loops on leaves)
    for _ in range(self.depth):
        predictions = self._branch(propagated_probabilities, predictions)
    return predictions
def power_iteration(self, u, W):
    '''
    Run a single power-iteration step.

    According to the paper, one iteration is sufficient.
    Returns the updated (u, v) pair.
    '''
    v_next = self._l2normalize(K.dot(u, K.transpose(W)))
    u_next = self._l2normalize(K.dot(v_next, W))
    return u_next, v_next
def call(self, inputs, states, training=None):
    """Run the parent cell, then replace the hidden state with a centroid mix.

    Returns:
        ``(h_before, [h_after, c])`` where ``h_before`` is the parent cell's
        output and ``h_after`` is the sampled mixture of centroids.
    """
    # Standard hidden state and carry from the parent cell.
    parent_output = super(STTAUCell, self).call(inputs, states)
    h_before = parent_output[0]
    c = parent_output[1][1]

    # STTAU modification. Sizes: B = batch size, H = hidden dimension size,
    # C = number of centroids.
    # BxC = BxH & HxC
    logits = K.dot(h_before, self.centroid_kernel)

    # Gumbel-Softmax sample with (learnt) temperature and the logits above.
    relaxed = tfp.distributions.RelaxedOneHotCategorical(
        self.temperature_weight, logits)
    y = relaxed.sample()  # BxC

    if self.hard_sample is True:
        # One-hot forward value; the stop_gradient keeps the soft sample's
        # gradient.
        y_hard = tf.cast(
            tf.one_hot(tf.argmax(y, -1), self.centroids), y.dtype)
        y = tf.stop_gradient(y_hard - y) + y

    # BxH = BxC & CxH
    h_after = K.dot(y, K.transpose(self.centroid_kernel))

    if 0 < self.dropout + self.recurrent_dropout and training is None:
        h_after._uses_learning_phase = True
    return h_before, [h_after, c]
def build(self, input_shape):
    """Create the layer weights, or tie the kernel to another layer.

    When ``self.tied_to`` is set, the kernel is the transpose of that layer's
    first weight; otherwise a fresh ``[last_dim, units]`` kernel is added.

    Raises:
        TypeError: If the layer dtype is neither floating nor complex.
        ValueError: If the last input dimension is undefined.
    """
    dtype = dtypes.as_dtype(self.dtype or K.floatx())
    if not (dtype.is_floating or dtype.is_complex):
        raise TypeError('Unable to build `Dense` layer with non-floating point '
                        'dtype %s' % (dtype,))

    input_shape = tensor_shape.TensorShape(input_shape)
    last_dim = tensor_shape.dimension_value(input_shape[-1])
    if last_dim is None:
        raise ValueError('The last dimension of the inputs to `Dense` '
                         'should be defined. Found `None`.')
    self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim})

    if self.tied_to is None:
        self.kernel = self.add_weight(
            'kernel',
            shape=[last_dim, self.units],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
            dtype=self.dtype,
            trainable=True)
    else:
        self.kernel = K.transpose(self.tied_to.weights[0])

    if not self.use_bias:
        self.bias = None
    else:
        self.bias = self.add_weight(
            'bias',
            shape=[self.units,],
            initializer=self.bias_initializer,
            regularizer=self.bias_regularizer,
            constraint=self.bias_constraint,
            dtype=self.dtype,
            trainable=True)
    self.built = True
def call(self, inputs, output_shape=None):
    """Scatter ``updates`` into a larger tensor at positions given by ``mask``.

    Args:
        inputs: Pair ``(updates, mask)``; ``mask`` holds flattened indices.
        output_shape: Target 4-element shape. Defaults to the shape of
            ``updates`` with axes 1 and 2 multiplied by ``self.size``.

    Returns:
        Result of ``tf.scatter_nd`` with the decoded indices.
    """
    updates, mask = inputs[0], inputs[1]
    mask = tf.cast(mask, 'int32')
    updates_shape = tf.shape(updates, out_type='int32')

    if output_shape is None:
        output_shape = (updates_shape[0],
                        updates_shape[1] * self.size[0],
                        updates_shape[2] * self.size[1],
                        updates_shape[3])

    # Decode batch, height, width and feature indices.
    ones = K.ones_like(mask, dtype='int32')
    batch_shape = K.concatenate([[updates_shape[0]], [1], [1], [1]], axis=0)
    batch_range = K.reshape(
        tf.range(output_shape[0], dtype='int32'), shape=batch_shape)
    b = ones * batch_range
    y = mask // (output_shape[2] * output_shape[3])
    x = (mask // output_shape[3]) % output_shape[2]
    f = ones * tf.range(output_shape[3], dtype='int32')

    # One (b, y, x, f) row per update value.
    total = tf.size(updates)
    indices = K.transpose(K.reshape(K.stack([b, y, x, f]), [4, total]))
    flat_values = K.reshape(updates, [total])
    return tf.scatter_nd(indices, flat_values, output_shape)
def call(self, inputs, training=None):
    """Look up embeddings from a spectrally normalised embedding matrix.

    Args:
        inputs: Index tensor; cast to int32 when it has another dtype.
        training: When 0/False the vector ``self.u`` is left untouched;
            otherwise it is updated as a control dependency. The body already
            read ``training`` but the signature did not declare it; the new
            default keeps existing ``call(inputs)`` callers working.

    Returns:
        Rows of the normalised embedding matrix gathered at ``inputs``.
    """
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')

    def _l2normalize(v, eps=1e-12):
        return v / (K.sum(v ** 2) ** 0.5 + eps)

    def power_iteration(W, u):
        # According to the paper, a single power iteration is enough.
        _u = u
        _v = _l2normalize(K.dot(_u, K.transpose(W)))
        _u = _l2normalize(K.dot(_v, W))
        return _u, _v

    W_shape = self.embeddings.shape.as_list()
    # Flatten the Tensor
    W_reshaped = K.reshape(self.embeddings, [-1, W_shape[-1]])
    _u, _v = power_iteration(W_reshaped, self.u)
    # Calculate Sigma
    sigma = K.dot(_v, W_reshaped)
    sigma = K.dot(sigma, K.transpose(_u))
    # normalize it
    W_bar = W_reshaped / sigma
    # reshape weight tensor
    if training in {0, False}:
        W_bar = K.reshape(W_bar, W_shape)
    else:
        with tf.control_dependencies([self.u.assign(_u)]):
            W_bar = K.reshape(W_bar, W_shape)
    # The layer's embeddings attribute is overwritten by the normalised one.
    self.embeddings = W_bar

    out = K.gather(self.embeddings, inputs)
    return out
def call(self, inputs):
    """Multi-head graph attention over node features and an adjacency matrix.

    ``inputs[0]`` holds node features (N x F) and ``inputs[1]`` the adjacency
    matrix (N x N). Heads are concatenated (activated per head) or averaged
    (activated after averaging), depending on ``self.attn_heads_reduction``.
    """
    X = inputs[0]  # Node features (N x F)
    A = inputs[1]  # Adjacency matrix (N x N)
    concat_heads = self.attn_heads_reduction == "concat"

    outputs = []
    for head in range(self.attn_heads):
        kernel = self.kernels[head]  # W in the paper (F x F")
        # Attention kernel a in the paper (2F" x 1)
        attention_kernel = self.attn_kernels[head]

        # Inputs to the attention network
        features = K.dot(X, kernel)  # (N x F")

        # a^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        attn_for_self = K.dot(features, attention_kernel[0])  # (N x 1)
        attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1)
        # (N x N) via broadcasting
        scores = attn_for_self + K.transpose(attn_for_neighs)

        # Non-linearity
        scores = LeakyReLU(alpha=0.2)(scores)

        # Mask values before activation (Vaswani et al., 2017)
        scores += -10e9 * (1.0 - A)

        # Softmax gives the attention coefficients
        scores = K.softmax(scores)  # (N x N)

        # Dropout on both attention coefficients and features
        dropout_attn = Dropout(self.dropout_rate)(scores)  # (N x N)
        dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F")

        # Linear combination with neighbours' features
        node_features = K.dot(dropout_attn, dropout_feat)  # (N x F")

        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])

        if concat_heads:
            # For "concat" the activation is applied per head (Eq. 5)
            node_features = self.activation(node_features)

        outputs.append(node_features)

    # Aggregate the heads' output according to the reduction method
    if concat_heads:
        output = K.concatenate(outputs)  # (N x KF")
    else:
        output = K.mean(K.stack(outputs), axis=0)  # (N x F")
        output = self.activation(output)
    return output
def gram_matrix(x):
    """Return the Gram matrix of a 3-D tensor.

    For "th" dimension ordering the tensor is flattened as is; otherwise its
    last axis is moved to the front first.
    """
    assert K.ndim(x) == 3
    channels_first = K.image_dim_ordering() == "th"
    source = x if channels_first else K.permute_dimensions(x, (2, 0, 1))
    features = K.batch_flatten(source)
    return K.dot(features, K.transpose(features))
def gram_matrix(x):
    """Return the Gram matrix of a 3-D tensor.

    For "channels_first" data the tensor is flattened as is; otherwise its
    last axis is moved to the front first.
    """
    assert K.ndim(x) == 3
    channels_first = K.image_data_format() == "channels_first"
    source = x if channels_first else K.permute_dimensions(x, (2, 0, 1))
    features = K.batch_flatten(source)
    return K.dot(features, K.transpose(features))
def compute_win(self, y_true, y_pred, to_numpy=False):
    """Compute the win score of ``y_pred`` against the true label's mask.

    Args:
        y_true: True labels.
        y_pred: Predicted scores.
        to_numpy: If True, reshape the result to a scalar and return its
            ``.numpy()`` value.

    Returns:
        The win score mapped through ``2 * (win - 0.5)``.
    """
    if self.N > self.num_leaves:
        # More nodes than leaves: pad both tensors via self._pad
        # (presumably one zero per non-leaf node - confirm in _pad).
        y_true = self._pad(y_true)
        y_pred = self._pad(y_pred)

    # propagate the probabilities (algo 1)
    propagated_probabilities = K.dot(self.A, K.transpose(y_pred))

    # Mask associated with the index of the actual label.
    win_mask = tf.gather(self.W, self.select_correct_idx(y_true))

    # win is q . w (algo 2)
    win = K.batch_dot(win_mask, K.transpose(propagated_probabilities))

    # win is in [0.5,1], remap to [0,1]
    remapped = 2 * (win - 0.5)
    if not to_numpy:
        return remapped
    return K.reshape(remapped, []).numpy()
def call(self, x, mask=None):
    """Register a center update and return each sample's squared distance.

    x[0] is Nx2, x[1] is Nx8 onehot, self.centers is 8x2.
    The result is also stored on ``self.result``.
    """
    features, onehot = x[0], x[1]
    onehot_t = K.transpose(onehot)

    # Per-class sum of (assigned center - feature), 8x2.
    delta_centers = K.dot(onehot_t, (K.dot(onehot, self.centers) - features))
    # Per-class sample count, plus one; 8x1.
    center_counts = K.sum(onehot_t, axis=1, keepdims=True) + 1
    delta_centers = delta_centers / center_counts

    new_centers = self.centers - self.alpha * delta_centers
    self.add_update((self.centers, new_centers), x)

    residual = features - K.dot(onehot, self.centers)
    self.result = K.sum(residual**2, axis=1, keepdims=True)
    return self.result  # Nx1
def custom_loss(y_true, y_pred):
    """Categorical cross-entropy plus an island loss on batch class centers.

    Reads ``features`` and ``num_classes`` from the enclosing scope.

    The pairwise term is now the cosine similarity between class centers.
    The previous ``dot / sqrt(square(dot))`` reduced every entry to its sign,
    which has zero gradient and evaluates to 0/0 = NaN whenever a class is
    absent from the batch (its center is the zero vector).

    Args:
        y_true: label vector of shape (batch_size, num_classes)
        y_pred: predicted class scores.

    Returns:
        ``categorical_crossentropy(y_true, y_pred) + island_loss``.
    """
    # Add 1 to avoid division by zero
    samples_per_cluster = K.transpose(
        K.sum(y_true, axis=0, keepdims=True) + 1)
    centers = K.dot(K.transpose(y_true), features) / samples_per_cluster
    center_loss = 0.5 * K.sum(K.square(features - K.dot(y_true, centers)))

    # Epsilon inside the sqrt keeps value and gradient finite for zero centers.
    center_norms = K.sqrt(
        K.sum(K.square(centers), axis=1, keepdims=True) + K.epsilon())
    norm_products = K.dot(center_norms, K.transpose(center_norms))
    pair_dist = K.dot(centers, K.transpose(centers)) / norm_products

    # subtract the diagonal (self-similarity, ~1), then shift every entry by 1
    pair_dist = pair_dist - K.eye(num_classes)
    pair_dist = pair_dist + 1
    pair_dist = K.sum(pair_dist)

    island_loss = center_loss + pair_dist
    return categorical_crossentropy(y_true, y_pred) + island_loss
def call(self, inputs, **kwargs):
    """Cosine similarity between matching rows of two embedding batches.

    Both inputs are L2-normalised along the last axis; the diagonal of their
    full similarity matrix is returned.
    """
    first, second = inputs
    first_unit = K.l2_normalize(first, axis=-1)
    second_unit = K.l2_normalize(second, axis=-1)
    all_pairs = K.dot(first_unit, K.transpose(second_unit))
    return tf.linalg.tensor_diag_part(all_pairs)
def power_iteration(W, u, rounds=1):
    '''
    Run `rounds` power-iteration steps starting from `u`.

    According to the paper, a single iteration is enough.
    Returns (W_sn, u, v) where W_sn = sum((u W) * v).
    '''
    u_est = u
    for _ in range(rounds):
        v_est = _l2normalizer(K.dot(u_est, W))
        u_est = _l2normalizer(K.dot(v_est, K.transpose(W)))

    W_sn = K.sum(K.dot(u_est, W) * v_est)
    return W_sn, u_est, v_est
def model(embedding_size, n_a):
    """Build a model scoring sentence vectors against fixed word embeddings.

    Reads ``answer_emb`` and ``T`` from the enclosing scope.

    Args:
        embedding_size: Feature size of each sentence timestep.
        n_a: Number of units of the GRU inside the bidirectional wrapper.

    Returns:
        A Keras ``Model`` mapping the sentence input to the transposed
        word/sentence product.
    """
    # Constant word embedding matrix.
    word_vec = tf.constant(answer_emb, name='Words', dtype='float32')

    # Encode each sentence into a single vector.
    sentence = Input(shape=(T, embedding_size), name='Sentences')
    encoder = Bidirectional(
        CuDNNGRU(units=n_a, return_sequences=False),
        name='Sentence_Vectors')
    sentence_vec = encoder(sentence)

    # word_vec . sentence_vec^T, then transposed.
    product = tf.matmul(
        word_vec, sentence_vec, transpose_b=True, name='Matrix')
    key_matrix = K.transpose(product)

    return Model(inputs=sentence, outputs=key_matrix)
def gram_matrix(x):
    """Return a stack of Gram matrices, one per image of a 4-D batch.

    The number of images is taken from ``self.Batch_Size`` of the enclosing
    scope.
    """
    assert K.ndim(x) == 4

    def _single_gram(img):
        # Gram matrix of one image, channels moved first when needed.
        if K.image_data_format() == 'channels_first':
            features = K.batch_flatten(img)
        else:
            features = K.batch_flatten(K.permute_dimensions(img, (2, 0, 1)))
        return K.dot(features, K.transpose(features))

    grams = [_single_gram(x[i, :, :, :]) for i in range(self.Batch_Size)]
    return tf.keras.backend.stack(grams)
def call(self, code_block: Tensor, training=False, **kwargs):
    """Attend over sub-token embeddings and score the whole vocabulary.

    The shape of every intermediate tensor is logged at INFO level. Shape
    remarks in the comments are the original author's and are not verified
    here.
    """
    log = self.logger.info

    # Note: all layers are wrapped with TimeDistributed, so shapes read as
    # [batch size, timesteps (token length), features, ...]; each subtoken is
    # treated as a timestep.

    # Large negative value wherever the input equals the padding id 0.
    mask_vector = K.cast(K.equal(code_block, 0), dtype='float32') * -1e7
    log("mask_vector shape = {}".format(mask_vector.shape))

    # tokens_embedding = [batch_size, max chunk length, embedding_dim]
    tokens_embedding = self.embedding_layer(code_block)
    log("Tokens shape = {}".format(tokens_embedding.shape))

    # h_t = [batch_size, k2]
    _, h_t = self.gru_layer(tokens_embedding, training=training)
    log("h_t shape = {}".format(h_t.shape))

    # L_feat = [batch size, token length, k2]
    l_feat = self.attention_feature_layer([tokens_embedding, h_t])
    log("L_feat shape = {}".format(l_feat.shape))

    # alpha = [batch size, token length] weights over embeddings
    alpha = self.attention_weights_layer([l_feat, mask_vector])
    log("alpha shape = {}".format(alpha.shape))

    # Attention-weighted sum of the input embeddings.
    # n_hat = [batch size, embedding dim]
    weighted = K.expand_dims(alpha, axis=-1) * tokens_embedding
    n_hat = K.sum(weighted, axis=1)
    log("n_hat shape = {}".format(n_hat.shape))

    # Embedding over the whole vocabulary.
    # E = [vocabulary size, embedding dim]
    E = self.embedding_layer.layer.embeddings
    log("E shape = {}".format(E.shape))

    # Score the attended vector against every vocabulary embedding, then
    # reverse the axes so the vocabulary ends up last.
    n_hat_E = K.nn.math_ops.tensordot(
        E, K.transpose(n_hat), axes=[[1], [0]])
    n_hat_E = K.permute_dimensions(n_hat_E, [2, 1, 0])
    log("n_hat_E shape = {}".format(n_hat_E.shape))

    # n = probability of each token in the vocabulary
    n = self.softmax_layer(K.bias_add(n_hat_E, self.bias))
    log("n shape = {}".format(n.shape))
    return n
def gram_matrix(x):
    """Return the Gram matrix of a 3-D tensor, divided by its element count.

    For 'th' ordering the tensor is flattened as is; otherwise its last axis
    is moved to the front first, so ``features`` always has one row per
    channel.

    The previous version reshaped ``features`` to ``(shape[0], -1)`` using the
    shape of the unpermuted ``x``. On the channels-last path ``shape[0]`` is
    not the channel count, so the rows were scrambled. ``K.batch_flatten``
    already returns the 2-D matrix needed, so the reshape is dropped; the 'th'
    path is unchanged.
    """
    assert K.ndim(x) == 3
    if image_dim_ordering() == 'th':
        features = K.batch_flatten(x)
    else:
        features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))

    # The product of all three dimensions does not depend on the axis order.
    shape = K.shape(x)
    element_count = shape[0] * shape[1] * shape[2]

    gram = K.dot(features, K.transpose(features)) / K.cast(
        element_count, dtype='float32')
    return gram
def call(self, x, mask=None):
    """Register a center update and return center loss minus a pairwise term.

    x[0] is N x feature_dim, x[1] is N x num_classes onehot, self.centers is
    num_classes x feature_dim. The result is also stored on ``self.result``.

    The squared distance is now computed from the residual calculated on the
    line above. It used to read ``self.result``, which is only assigned at the
    end of this method.
    """
    # num_classes x feature_dim
    delta_centers = K.dot(
        K.transpose(x[1]), (K.dot(x[1], self.centers) - x[0]))
    # num_classes x 1
    center_counts = K.sum(K.transpose(x[1]), axis=1, keepdims=True) + 1
    delta_centers /= center_counts
    new_centers = self.centers - self.alpha * delta_centers
    self.add_update((self.centers, new_centers), x)

    center_loss = x[0] - K.dot(x[1], self.centers)
    center_loss = K.sum(center_loss**2, axis=1, keepdims=True)

    # NOTE(review): K.dot(self.centers, self.centers) only has compatible
    # shapes when num_classes == feature_dim, and dividing by
    # sqrt(square(.)) reduces each entry to its sign. The intended pairwise
    # term is unclear from this block alone, so it is left as written.
    pair_dist = K.dot(K.transpose(self.centers), self.centers)
    pair_dist = pair_dist - K.dot(self.centers, self.centers)
    pair_dist = pair_dist / K.sqrt(K.square(pair_dist))
    pair_dist = K.sum(pair_dist, keepdims=True)

    self.result = center_loss - pair_dist
    return self.result